Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit 2645b72

Browse files
committed
[SYCL] Add test cases for SYCL-2020 reductions used in read_write mode
These changes verify intel/llvm#3315 Signed-off-by: Vyacheslav N Klochkov <[email protected]>
1 parent 3d50724 commit 2645b72

10 files changed

+173
-72
lines changed

SYCL/Reduction/reduction_nd_N_vars.cpp

Lines changed: 32 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,11 @@
1212
// RUN: %GPU_RUN_PLACEHOLDER %t.out
1313
// RUN: %ACC_RUN_PLACEHOLDER %t.out
1414

15+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DTEST_SYCL2020_REDUCTIONS %s -o %t2020.out
16+
// RUN: %CPU_RUN_PLACEHOLDER %t2020.out
17+
// RUN: %GPU_RUN_PLACEHOLDER %t2020.out
18+
// RUN: %ACC_RUN_PLACEHOLDER %t2020.out
19+
1520
// This test checks handling of parallel_for() accepting nd_range and
1621
// two or more reductions.
1722

@@ -125,10 +130,22 @@ int runTest(T1 IdentityVal1, T1 InitVal1, BinaryOperation1 BOp1,
125130
auto In3 = InBuf3.template get_access<access::mode::read>(CGH);
126131
auto In4 = InBuf4.template get_access<access::mode::read>(CGH);
127132

133+
#ifdef TEST_SYCL2020_REDUCTIONS
134+
auto Redu1 = sycl::reduction(OutBuf1, CGH, IdentityVal1, BOp1);
135+
auto Redu2 = sycl::reduction(OutBuf2, CGH, IdentityVal2, BOp2);
136+
auto Redu3 = sycl::reduction(OutBuf3, CGH, IdentityVal3, BOp3);
137+
auto Redu4 = sycl::reduction(Out4, IdentityVal4, BOp4);
138+
#else
128139
auto Out1 = OutBuf1.template get_access<Mode1>(CGH);
129140
auto Out2 = OutBuf2.template get_access<Mode2>(CGH);
130141
accessor<T3, 0, Mode3, access::target::global_buffer> Out3(OutBuf3, CGH);
131142

143+
auto Redu1 = ONEAPI::reduction(Out1, IdentityVal1, BOp1);
144+
auto Redu2 = ONEAPI::reduction(Out2, IdentityVal2, BOp2);
145+
auto Redu3 = ONEAPI::reduction(Out3, IdentityVal3, BOp3);
146+
auto Redu4 = ONEAPI::reduction(Out4, IdentityVal4, BOp4);
147+
#endif
148+
132149
auto Lambda = [=](nd_item<1> NDIt, auto &Sum1, auto &Sum2, auto &Sum3,
133150
auto &Sum4) {
134151
size_t I = NDIt.get_global_id(0);
@@ -138,15 +155,6 @@ int runTest(T1 IdentityVal1, T1 InitVal1, BinaryOperation1 BOp1,
138155
Sum4.combine(In4[I]);
139156
};
140157

141-
auto Redu1 =
142-
ONEAPI::reduction<T1, BinaryOperation1>(Out1, IdentityVal1, BOp1);
143-
auto Redu2 =
144-
ONEAPI::reduction<T2, BinaryOperation2>(Out2, IdentityVal2, BOp2);
145-
auto Redu3 =
146-
ONEAPI::reduction<T3, BinaryOperation3>(Out3, IdentityVal3, BOp3);
147-
auto Redu4 =
148-
ONEAPI::reduction<T4, BinaryOperation4>(Out4, IdentityVal4, BOp4);
149-
150158
auto NDR = nd_range<1>{range<1>(NWorkItems), range<1>{WGSize}};
151159
CGH.parallel_for<ReductionExample>(NDR, Redu1, Redu2, Redu3, Redu4,
152160
Lambda);
@@ -186,17 +194,23 @@ int runTest(T1 IdentityVal1, T1 InitVal1, BinaryOperation1 BOp1,
186194
}
187195

188196
int main() {
189-
int Error =
190-
runTest<class ReduFloatPlus16x1, float, access::mode::discard_write, int,
191-
access::mode::read_write, short, access::mode::read_write, int>(
192-
0, 1000, std::plus<float>{}, 0, 2000, std::plus<>{}, 0, 4000,
193-
std::bit_or<>{}, 0, 8000, std::bit_xor<>{}, usm::alloc::shared, 16,
194-
16);
197+
constexpr access::mode ReadWriteMode = access::mode::read_write;
198+
#ifdef TEST_SYCL2020_REDUCTIONS
199+
// TODO: property::reduction::initialize_to_identity is not supported yet.
200+
// Thus only read_write mode is tested now.
201+
constexpr access::mode DiscardWriteMode = access::mode::read_write;
202+
#else
203+
constexpr access::mode DiscardWriteMode = access::mode::discard_write;
204+
#endif
205+
206+
int Error = runTest<class ReduFloatPlus16x1, float, DiscardWriteMode, int,
207+
ReadWriteMode, short, ReadWriteMode, int>(
208+
0, 1000, std::plus<float>{}, 0, 2000, std::plus<>{}, 0, 4000,
209+
std::bit_or<>{}, 0, 8000, std::bit_xor<>{}, usm::alloc::shared, 16, 16);
195210

196211
auto Add = [](auto x, auto y) { return (x + y); };
197-
Error += runTest<class ReduFloatPlus5x257, float, access::mode::read_write,
198-
int, access::mode::read_write, short,
199-
access::mode::discard_write, int>(
212+
Error += runTest<class ReduFloatPlus5x257, float, ReadWriteMode, int,
213+
ReadWriteMode, short, DiscardWriteMode, int>(
200214
0, 1000, std::plus<float>{}, 0, 2000, std::plus<>{}, 0, 4000, Add, 0,
201215
8000, std::bit_xor<int>{}, usm::alloc::device, 5 * (256 + 1), 5);
202216

SYCL/Reduction/reduction_nd_ext_double.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,14 @@
33
// RUN: %GPU_RUN_PLACEHOLDER %t.out
44
// RUN: %ACC_RUN_PLACEHOLDER %t.out
55

6+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DTEST_SYCL2020_REDUCTIONS %s -o %t2020.out
7+
// RUN: %CPU_RUN_PLACEHOLDER %t2020.out
8+
// RUN: %GPU_RUN_PLACEHOLDER %t2020.out
9+
// RUN: %ACC_RUN_PLACEHOLDER %t2020.out
10+
611
// TODO: Enable the test for HOST when it supports intel::reduce() and barrier()
712
// RUNx: %HOST_RUN_PLACEHOLDER %t.out
13+
// RUNx: %HOST_RUN_PLACEHOLDER %t2020.out
814

915
// This test performs basic checks of parallel_for(nd_range, reduction, func)
1016
// used with 'double' type.

SYCL/Reduction/reduction_nd_ext_half.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,18 @@
11
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
22
// RUN: %GPU_RUN_PLACEHOLDER %t.out
33

4+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DTEST_SYCL2020_REDUCTIONS %s -o %t2020.out
5+
// RUN: %GPU_RUN_PLACEHOLDER %t2020.out
6+
47
// TODO: Enable the test for CPU/ACC when they support half type.
58
// RUNx: %CPU_RUN_PLACEHOLDER %t.out
69
// RUNx: %ACC_RUN_PLACEHOLDER %t.out
10+
// RUNx: %CPU_RUN_PLACEHOLDER %t2020.out
11+
// RUNx: %ACC_RUN_PLACEHOLDER %t2020.out
712

813
// TODO: Enable the test for HOST when it supports intel::reduce() and barrier()
914
// RUNx: %HOST_RUN_PLACEHOLDER %t.out
15+
// RUNx: %HOST_RUN_PLACEHOLDER %t2020.out
1016

1117
// This test performs basic checks of parallel_for(nd_range, reduction, func)
1218
// used with 'half' type.

SYCL/Reduction/reduction_nd_ext_type.hpp

Lines changed: 24 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,12 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
2626
queue Q;
2727
Q.submit([&](handler &CGH) {
2828
auto In = InBuf.template get_access<access::mode::read>(CGH);
29+
#ifdef TEST_SYCL2020_REDUCTIONS
30+
auto Redu = sycl::reduction(OutBuf, CGH, Identity, BOp);
31+
#else
2932
accessor<T, Dim, Mode, access::target::global_buffer> Out(OutBuf, CGH);
3033
auto Redu = ONEAPI::reduction(Out, Identity, BOp);
34+
#endif
3135

3236
range<1> GlobalRange(NWItems);
3337
range<1> LocalRange(WGSize);
@@ -59,25 +63,29 @@ template <typename T> int runTests(const string_class &ExtensionName) {
5963
return 0;
6064
}
6165

66+
constexpr access::mode ReadWriteMode = access::mode::read_write;
67+
#ifdef TEST_SYCL2020_REDUCTIONS
68+
// TODO: property::reduction::initialize_to_identity is not supported yet.
69+
// Thus only read_write mode is tested now.
70+
constexpr access::mode DiscardWriteMode = access::mode::read_write;
71+
#else
72+
constexpr access::mode DiscardWriteMode = access::mode::discard_write;
73+
#endif
74+
6275
// Check some less standards WG sizes and corner cases first.
63-
test<class KernelName_oTh, T, 1, access::mode::read_write,
64-
std::multiplies<T>>(0, 4, 4);
65-
test<class KernelName_QUQnMARQT, T, 0, access::mode::discard_write,
66-
ONEAPI::plus<T>>(0, 4, 64);
76+
test<class A, T, 1, ReadWriteMode, std::multiplies<T>>(0, 4, 4);
77+
test<class B, T, 0, DiscardWriteMode, ONEAPI::plus<T>>(0, 4, 64);
6778

68-
test<class KernelName_xGixNo, T, 0, access::mode::read_write,
69-
ONEAPI::minimum<T>>(getMaximumFPValue<T>(), 7, 7);
70-
test<class KernelName_qXNFw, T, 1, access::mode::discard_write,
71-
ONEAPI::maximum<T>>(getMinimumFPValue<T>(), 7, 7 * 5);
79+
test<class C, T, 0, ReadWriteMode, ONEAPI::minimum<T>>(getMaximumFPValue<T>(),
80+
7, 7);
81+
test<class D, T, 1, access::mode::discard_write, ONEAPI::maximum<T>>(
82+
getMinimumFPValue<T>(), 7, 7 * 5);
7283

73-
#if __cplusplus >= 201402L
74-
test<class KernelName_lXdWtzANdDcvm, T, 1, access::mode::read_write,
75-
ONEAPI::plus<>>(1, 3, 3 * 5);
76-
test<class KernelName_FDQalsDxmbi, T, 1, access::mode::discard_write,
77-
ONEAPI::minimum<>>(getMaximumFPValue<T>(), 3, 3);
78-
test<class KernelName_TaNRRxDRXbzYrFImPYC, T, 0, access::mode::discard_write,
79-
ONEAPI::maximum<>>(getMinimumFPValue<T>(), 3, 3);
80-
#endif // __cplusplus >= 201402L
84+
test<class E, T, 1, ReadWriteMode, ONEAPI::plus<>>(1, 3, 3 * 5);
85+
test<class F, T, 1, DiscardWriteMode, ONEAPI::minimum<>>(
86+
getMaximumFPValue<T>(), 3, 3);
87+
test<class G, T, 0, DiscardWriteMode, ONEAPI::maximum<>>(
88+
getMinimumFPValue<T>(), 3, 3);
8189

8290
std::cout << "Test passed\n";
8391
return 0;

SYCL/Reduction/reduction_nd_s0_rw.cpp

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,13 @@
11
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
2-
// RUNx: %HOST_RUN_PLACEHOLDER %t.out
32
// RUN: %CPU_RUN_PLACEHOLDER %t.out
43
// RUN: %GPU_RUN_PLACEHOLDER %t.out
54
// RUN: %ACC_RUN_PLACEHOLDER %t.out
65

6+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DTEST_SYCL2020_REDUCTIONS %s -o %t2020.out
7+
// RUN: %CPU_RUN_PLACEHOLDER %t2020.out
8+
// RUN: %GPU_RUN_PLACEHOLDER %t2020.out
9+
// RUN: %ACC_RUN_PLACEHOLDER %t2020.out
10+
711
// This test performs basic checks of parallel_for(nd_range, reduction, func)
812
// with reductions initialized with 0-dimensional read_write accessor.
913

@@ -13,11 +17,15 @@
1317

1418
using namespace cl::sycl;
1519

20+
// This allocator is needed only for the purpose of testing buffers
21+
// with allocator that is not same_as sycl::buffer_allocator.
22+
struct CustomAllocator : public sycl::buffer_allocator {};
23+
1624
template <typename SpecializationKernelName, typename T, int Dim,
1725
class BinaryOperation>
1826
void test(T Identity, size_t WGSize, size_t NWItems) {
1927
buffer<T, 1> InBuf(NWItems);
20-
buffer<T, 1> OutBuf(1);
28+
buffer<T, 1, CustomAllocator> OutBuf(1);
2129

2230
// Initialize.
2331
BinaryOperation BOp;
@@ -30,9 +38,13 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
3038
queue Q;
3139
Q.submit([&](handler &CGH) {
3240
auto In = InBuf.template get_access<access::mode::read>(CGH);
41+
#ifdef TEST_SYCL2020_REDUCTIONS
42+
auto Redu = sycl::reduction(OutBuf, CGH, Identity, BOp);
43+
#else
3344
accessor<T, Dim, access::mode::read_write, access::target::global_buffer>
3445
Out(OutBuf, CGH);
3546
auto Redu = ONEAPI::reduction(Out, Identity, BOp);
47+
#endif
3648

3749
range<1> GlobalRange(NWItems);
3850
range<1> LocalRange(WGSize);

SYCL/Reduction/reduction_nd_s1_rw.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,13 @@
11
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
2-
// RUNx: %HOST_RUN_PLACEHOLDER %t.out
32
// RUN: %CPU_RUN_PLACEHOLDER %t.out
43
// RUN: %GPU_RUN_PLACEHOLDER %t.out
54
// RUN: %ACC_RUN_PLACEHOLDER %t.out
65

6+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DTEST_SYCL2020_REDUCTIONS %s -o %t2020.out
7+
// RUN: %CPU_RUN_PLACEHOLDER %t2020.out
8+
// RUN: %GPU_RUN_PLACEHOLDER %t2020.out
9+
// RUN: %ACC_RUN_PLACEHOLDER %t2020.out
10+
711
// This test performs basic checks of parallel_for(nd_range, reduction, func)
812
// with reductions initialized with 1-dimensional read_write accessor
913
// accessing 1 element buffer.

SYCL/Reduction/reduction_placeholder.cpp

Lines changed: 29 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -16,22 +16,25 @@
1616

1717
using namespace cl::sycl;
1818

19+
template <typename... Ts> class KNGroup;
20+
1921
template <typename SpecializationKernelName, typename T, int Dim,
20-
class BinaryOperation>
21-
void test(T Identity, size_t WGSize, size_t NWItems) {
22+
class BinaryOperation, access::mode Mode>
23+
void testOneCase(T Identity, T Init, size_t WGSize, size_t NWItems) {
2224
// Initialize.
2325
T CorrectOut;
2426
BinaryOperation BOp;
2527

2628
buffer<T, 1> OutBuf(1);
2729
buffer<T, 1> InBuf(NWItems);
2830
initInputData(InBuf, CorrectOut, Identity, BOp, NWItems);
31+
if (Mode == access::mode::read_write)
32+
CorrectOut = BOp(CorrectOut, Init);
2933

30-
(OutBuf.template get_access<access::mode::write>())[0] = Identity;
34+
(OutBuf.template get_access<access::mode::write>())[0] = Init;
3135

32-
auto Out =
33-
accessor<T, Dim, access::mode::read_write, access::target::global_buffer,
34-
access::placeholder::true_t>(OutBuf);
36+
auto Out = accessor<T, Dim, Mode, access::target::global_buffer,
37+
access::placeholder::true_t>(OutBuf);
3538
// Compute.
3639
queue Q;
3740
Q.submit([&](handler &CGH) {
@@ -58,28 +61,34 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
5861
}
5962
}
6063

64+
template <typename KernelName, typename T, int Dim, class BinaryOperation>
65+
void test(T Identity, T Init, size_t WGSize, size_t NWItems) {
66+
testOneCase<KNGroup<KernelName, class RWCase>, T, Dim, BinaryOperation,
67+
access::mode::read_write>(Identity, Init, WGSize, NWItems);
68+
testOneCase<KNGroup<KernelName, class DWCase>, T, Dim, BinaryOperation,
69+
access::mode::discard_write>(Identity, Init, WGSize, NWItems);
70+
}
71+
6172
int main() {
6273
// fast atomics and fast reduce
63-
test<class KernelName_LpplYqDbNN, int, 1, ONEAPI::plus<int>>(0, 49, 49 * 5);
64-
test<class KernelName_FlDEESAfXYXiBZhnEDQ, int, 0, ONEAPI::plus<int>>(0, 8,
65-
8);
74+
test<class AtomicReduce1, int, 1, ONEAPI::plus<int>>(0, 77, 49, 49 * 5);
75+
test<class AtomicReduce2, int, 0, ONEAPI::plus<int>>(0, -77, 8, 8);
6676

6777
// fast atomics
68-
test<class KernelName_caKErpdwXzEsGGkr, int, 0, ONEAPI::bit_or<int>>(0, 7,
69-
7 * 3);
70-
test<class KernelName_YbnKY, int, 1, ONEAPI::bit_or<int>>(0, 4, 128);
78+
test<class Atomic1, int, 0, ONEAPI::bit_or<int>>(0, 233, 7, 7 * 3);
79+
test<class Atomic2, int, 1, ONEAPI::bit_or<int>>(0, 177, 4, 128);
7180

7281
// fast reduce
73-
test<class KernelName_JQuAndqdovQbAHmVLm, float, 1, ONEAPI::minimum<float>>(
74-
getMaximumFPValue<float>(), 5, 5 * 7);
75-
test<class KernelName_MBbbTWwSc, float, 0, ONEAPI::maximum<float>>(
76-
getMinimumFPValue<float>(), 4, 128);
82+
test<class Reduce1, float, 1, ONEAPI::minimum<float>>(
83+
getMaximumFPValue<float>(), -5.0, 5, 5 * 7);
84+
test<class Reduce2, float, 0, ONEAPI::maximum<float>>(
85+
getMinimumFPValue<float>(), -5.0, 4, 128);
7786

7887
// generic algorithm
79-
test<class KernelName_WpNdTbTtYt, int, 0, std::multiplies<int>>(1, 7, 7 * 5);
80-
test<class KernelName_yAwH, int, 1, std::multiplies<int>>(1, 8, 16);
81-
test<class KernelName_BNuHxeewzfXATi, CustomVec<short>, 0,
82-
CustomVecPlus<short>>(CustomVec<short>(0), 8, 8 * 3);
88+
test<class Generic1, int, 0, std::multiplies<int>>(1, 2, 7, 7 * 5);
89+
test<class Generic2, int, 1, std::multiplies<int>>(1, 3, 8, 16);
90+
test<class Generic13, CustomVec<short>, 0, CustomVecPlus<short>>(
91+
CustomVec<short>(0), CustomVec<short>(4), 8, 8 * 3);
8392

8493
std::cout << "Test passed\n";
8594
return 0;

SYCL/Reduction/reduction_queue_parallel_for.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,13 @@
33
// RUN: %ACC_RUN_PLACEHOLDER %t.out
44
// RUN: %CPU_RUN_PLACEHOLDER %t.out
55

6+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DTEST_SYCL2020_REDUCTIONS %s -o %t2020.out
7+
// RUN: %CPU_RUN_PLACEHOLDER %t2020.out
8+
// RUN: %GPU_RUN_PLACEHOLDER %t2020.out
9+
// RUN: %ACC_RUN_PLACEHOLDER %t2020.out
10+
611
// RUNx: %HOST_RUN_PLACEHOLDER %t.out
12+
// RUNx: %HOST_RUN_PLACEHOLDER %t2020.out
713
// TODO: Enable the test for HOST when it supports ONEAPI::reduce() and
814
// barrier()
915

@@ -25,10 +31,17 @@ int main() {
2531
int *Sum = malloc_shared<int>(1, Q);
2632
*Sum = 0;
2733

34+
#ifdef TEST_SYCL2020_REDUCTIONS
35+
Q.parallel_for<class XYZ>(
36+
nd_range<1>{NElems, WGSize}, sycl::reduction(Sum, std::plus<>()),
37+
[=](nd_item<1> It, auto &Sum) { Sum += Data[It.get_global_id(0)]; })
38+
.wait();
39+
#else
2840
Q.parallel_for<class XYZ>(
2941
nd_range<1>{NElems, WGSize}, ONEAPI::reduction(Sum, ONEAPI::plus<>()),
3042
[=](nd_item<1> It, auto &Sum) { Sum += Data[It.get_global_id(0)]; })
3143
.wait();
44+
#endif
3245

3346
int ExpectedSum = (NElems - 1) * NElems / 2;
3447
int Error = 0;

0 commit comments

Comments
 (0)