Skip to content

Commit 1025cd5

Browse files
authored
[SYCL] Add LIT test for parallel_for() accepting many reductions (intel/llvm-test-suite#131)
This test verifies the following change-set in compiler: intel#3123 Signed-off-by: Vyacheslav N Klochkov <[email protected]>
1 parent d2c96c9 commit 1025cd5

File tree

1 file changed

+208
-0
lines changed

1 file changed

+208
-0
lines changed
Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,208 @@
1+
// TODO: level_zero reports an internal error for this test.
2+
// UNSUPPORTED: level_zero
3+
4+
// TODO: Windows implementation of std::tuple is not trivially copiable and
5+
// thus cannot be passed from HOST to DEVICE. Enable the test on Windows when
6+
// SYCL RT gets new type traits having less strict requirements for objects
7+
// being passed to DEVICE.
8+
// UNSUPPORTED: windows
9+
10+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
11+
// RUN: %CPU_RUN_PLACEHOLDER %t.out
12+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
13+
// RUN: %ACC_RUN_PLACEHOLDER %t.out
14+
15+
// This test checks handling of parallel_for() accepting nd_range and
16+
// two or more reductions.
17+
18+
#include "reduction_utils.hpp"
19+
20+
#include <CL/sycl.hpp>
21+
22+
#include <cassert>
23+
#include <cmath>
24+
#include <cstdint>
25+
#include <cstdio>
26+
#include <cstdlib>
27+
#include <numeric>
28+
#include <string>
29+
30+
template <typename... Ts> class KernelNameGroup;
31+
32+
using namespace cl::sycl;
33+
34+
template <typename T>
35+
bool cherkResultIsExpected(int TestCaseNum, T Expected, T Computed) {
36+
bool Success;
37+
if (!std::is_floating_point<T>::value)
38+
Success = (Expected == Computed);
39+
else
40+
Success = std::abs((Expected / Computed) - 1) < 0.5;
41+
42+
if (!Success)
43+
std::cout << TestCaseNum << ": Expected value = " << Expected
44+
<< ", Computed value = " << Computed << "\n";
45+
46+
return Success;
47+
}
48+
49+
template <class ReductionExample, typename T1, access::mode Mode1, typename T2,
50+
access::mode Mode2, typename T3, access::mode Mode3, typename T4,
51+
class BinaryOperation1, class BinaryOperation2,
52+
class BinaryOperation3, class BinaryOperation4>
53+
int runTest(T1 IdentityVal1, T1 InitVal1, BinaryOperation1 BOp1,
54+
T2 IdentityVal2, T2 InitVal2, BinaryOperation2 BOp2,
55+
T3 IdentityVal3, T3 InitVal3, BinaryOperation3 BOp3,
56+
T4 IdentityVal4, T3 InitVal4, BinaryOperation4 BOp4,
57+
usm::alloc AllocType4, size_t NWorkItems, size_t WGSize) {
58+
buffer<T1, 1> InBuf1(NWorkItems);
59+
buffer<T2, 1> InBuf2(NWorkItems);
60+
buffer<T3, 1> InBuf3(NWorkItems);
61+
buffer<T4, 1> InBuf4(NWorkItems);
62+
buffer<T1, 1> OutBuf1(1);
63+
buffer<T2, 1> OutBuf2(1);
64+
buffer<T3, 1> OutBuf3(1);
65+
66+
queue Q;
67+
auto Dev = Q.get_device();
68+
if (AllocType4 == usm::alloc::shared &&
69+
!Dev.get_info<info::device::usm_shared_allocations>())
70+
return 4;
71+
if (AllocType4 == usm::alloc::host &&
72+
!Dev.get_info<info::device::usm_host_allocations>())
73+
return 4;
74+
if (AllocType4 == usm::alloc::device &&
75+
!Dev.get_info<info::device::usm_device_allocations>())
76+
return 4;
77+
T4 *Out4 = (T4 *)malloc(sizeof(T4), Dev, Q.get_context(), AllocType4);
78+
if (Out4 == nullptr)
79+
return 4;
80+
81+
// Initialize the arrays with sentinel values
82+
// and pre-compute the expected result 'CorrectOut'.
83+
T1 CorrectOut1;
84+
T2 CorrectOut2;
85+
T3 CorrectOut3;
86+
T4 CorrectOut4;
87+
initInputData(InBuf1, CorrectOut1, IdentityVal1, BOp1, NWorkItems);
88+
initInputData(InBuf2, CorrectOut2, IdentityVal2, BOp2, NWorkItems);
89+
initInputData(InBuf3, CorrectOut3, IdentityVal3, BOp3, NWorkItems);
90+
initInputData(InBuf4, CorrectOut4, IdentityVal4, BOp4, NWorkItems);
91+
92+
if (Mode1 == access::mode::read_write)
93+
CorrectOut1 = BOp1(CorrectOut1, InitVal1);
94+
if (Mode2 == access::mode::read_write)
95+
CorrectOut2 = BOp2(CorrectOut2, InitVal2);
96+
if (Mode3 == access::mode::read_write)
97+
CorrectOut3 = BOp3(CorrectOut3, InitVal3);
98+
// 4th reduction is USM and this is read_write.
99+
CorrectOut4 = BOp4(CorrectOut4, InitVal4);
100+
101+
// Inititialize data.
102+
{
103+
auto Out1 = OutBuf1.template get_access<access::mode::write>();
104+
Out1[0] = InitVal1;
105+
auto Out2 = OutBuf2.template get_access<access::mode::write>();
106+
Out2[0] = InitVal2;
107+
auto Out3 = OutBuf3.template get_access<access::mode::write>();
108+
Out3[0] = InitVal3;
109+
110+
if (AllocType4 == usm::alloc::device) {
111+
Q.submit([&](handler &CGH) {
112+
CGH.single_task<
113+
KernelNameGroup<ReductionExample, class KernelNameUSM4>>(
114+
[=]() { *Out4 = InitVal4; });
115+
}).wait();
116+
} else {
117+
*Out4 = InitVal4;
118+
}
119+
}
120+
121+
// The main code to be tested.
122+
Q.submit([&](handler &CGH) {
123+
auto In1 = InBuf1.template get_access<access::mode::read>(CGH);
124+
auto In2 = InBuf2.template get_access<access::mode::read>(CGH);
125+
auto In3 = InBuf3.template get_access<access::mode::read>(CGH);
126+
auto In4 = InBuf4.template get_access<access::mode::read>(CGH);
127+
128+
auto Out1 = OutBuf1.template get_access<Mode1>(CGH);
129+
auto Out2 = OutBuf2.template get_access<Mode2>(CGH);
130+
accessor<T3, 0, Mode3, access::target::global_buffer> Out3(OutBuf3, CGH);
131+
132+
auto Lambda = [=](nd_item<1> NDIt, auto &Sum1, auto &Sum2, auto &Sum3,
133+
auto &Sum4) {
134+
size_t I = NDIt.get_global_id(0);
135+
Sum1.combine(In1[I]);
136+
Sum2.combine(In2[I]);
137+
Sum3.combine(In3[I]);
138+
Sum4.combine(In4[I]);
139+
};
140+
141+
auto Redu1 =
142+
ONEAPI::reduction<T1, BinaryOperation1>(Out1, IdentityVal1, BOp1);
143+
auto Redu2 =
144+
ONEAPI::reduction<T2, BinaryOperation2>(Out2, IdentityVal2, BOp2);
145+
auto Redu3 =
146+
ONEAPI::reduction<T3, BinaryOperation3>(Out3, IdentityVal3, BOp3);
147+
auto Redu4 =
148+
ONEAPI::reduction<T4, BinaryOperation4>(Out4, IdentityVal4, BOp4);
149+
150+
auto NDR = nd_range<1>{range<1>(NWorkItems), range<1>{WGSize}};
151+
CGH.parallel_for<ReductionExample>(NDR, Redu1, Redu2, Redu3, Redu4,
152+
Lambda);
153+
}).wait();
154+
155+
// Check the results and free memory.
156+
int Error = 0;
157+
{
158+
auto Out1 = OutBuf1.template get_access<access::mode::read>();
159+
auto Out2 = OutBuf2.template get_access<access::mode::read>();
160+
auto Out3 = OutBuf3.template get_access<access::mode::read>();
161+
162+
T4 Out4Val;
163+
if (AllocType4 == usm::alloc::device) {
164+
buffer<T4, 1> Buf(&Out4Val, range<1>(1));
165+
Q.submit([&](handler &CGH) {
166+
auto OutAcc = Buf.template get_access<access::mode::discard_write>(CGH);
167+
CGH.copy(Out4, OutAcc);
168+
});
169+
Out4Val = (Buf.template get_access<access::mode::read>())[0];
170+
} else {
171+
Out4Val = *Out4;
172+
}
173+
174+
Error += cherkResultIsExpected(1, CorrectOut1, Out1[0]) ? 0 : 1;
175+
Error += cherkResultIsExpected(2, CorrectOut2, Out2[0]) ? 0 : 1;
176+
Error += cherkResultIsExpected(3, CorrectOut3, Out3[0]) ? 0 : 1;
177+
Error += cherkResultIsExpected(4, CorrectOut4, Out4Val) ? 0 : 1;
178+
free(Out4, Q.get_context());
179+
}
180+
181+
if (Error)
182+
std::cerr << "The test failed for nd_range(" << NWorkItems << "," << WGSize
183+
<< ")\n\n";
184+
185+
return Error;
186+
}
187+
188+
int main() {
189+
int Error =
190+
runTest<class ReduFloatPlus16x1, float, access::mode::discard_write, int,
191+
access::mode::read_write, short, access::mode::read_write, int>(
192+
0, 1000, std::plus<float>{}, 0, 2000, std::plus<>{}, 0, 4000,
193+
std::bit_or<>{}, 0, 8000, std::bit_xor<>{}, usm::alloc::shared, 16,
194+
16);
195+
196+
auto Add = [](auto x, auto y) { return (x + y); };
197+
Error += runTest<class ReduFloatPlus5x257, float, access::mode::read_write,
198+
int, access::mode::read_write, short,
199+
access::mode::discard_write, int>(
200+
0, 1000, std::plus<float>{}, 0, 2000, std::plus<>{}, 0, 4000, Add, 0,
201+
8000, std::bit_xor<int>{}, usm::alloc::device, 5 * (256 + 1), 5);
202+
203+
if (!Error)
204+
std::cout << "Test passed\n";
205+
else
206+
std::cout << Error << " test-cases failed\n";
207+
return Error;
208+
}

0 commit comments

Comments
 (0)