12
12
// RUN: %GPU_RUN_PLACEHOLDER %t.out
13
13
// RUN: %ACC_RUN_PLACEHOLDER %t.out
14
14
15
- // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DTEST_SYCL2020_REDUCTIONS %s -o %t2020.out
16
- // RUN: %CPU_RUN_PLACEHOLDER %t2020.out
17
- // RUN: %GPU_RUN_PLACEHOLDER %t2020.out
18
- // RUN: %ACC_RUN_PLACEHOLDER %t2020.out
19
-
20
15
// This test checks handling of parallel_for() accepting nd_range and
21
16
// two or more reductions.
22
17
32
27
#include < numeric>
33
28
#include < string>
34
29
35
- template <typename ... Ts> class KernelNameGroup ;
36
-
37
30
using namespace cl ::sycl;
38
31
32
+ template <typename ... Ts> class KNameGroup ;
33
+ template <typename T, bool B> class KName ;
34
+
35
+ constexpr access::mode RW = access::mode::read_write;
36
+ constexpr access::mode DW = access::mode::discard_write;
37
+
39
38
template <typename T>
40
39
bool cherkResultIsExpected (int TestCaseNum, T Expected, T Computed) {
41
40
bool Success;
@@ -51,11 +50,12 @@ bool cherkResultIsExpected(int TestCaseNum, T Expected, T Computed) {
51
50
return Success;
52
51
}
53
52
54
- template <class ReductionExample , typename T1, access::mode Mode1, typename T2,
55
- access::mode Mode2, typename T3, access::mode Mode3, typename T4,
56
- class BinaryOperation1 , class BinaryOperation2 ,
53
+ // Returns 0 if the test case passed. Otherwise, some non-zero value.
54
+ template <class Name , bool IsSYCL2020Mode, typename T1, access::mode Mode1,
55
+ typename T2, access::mode Mode2, typename T3, access::mode Mode3,
56
+ typename T4, class BinaryOperation1 , class BinaryOperation2 ,
57
57
class BinaryOperation3 , class BinaryOperation4 >
58
- int runTest (T1 IdentityVal1, T1 InitVal1, BinaryOperation1 BOp1,
58
+ int testOne (T1 IdentityVal1, T1 InitVal1, BinaryOperation1 BOp1,
59
59
T2 IdentityVal2, T2 InitVal2, BinaryOperation2 BOp2,
60
60
T3 IdentityVal3, T3 InitVal3, BinaryOperation3 BOp3,
61
61
T4 IdentityVal4, T3 InitVal4, BinaryOperation4 BOp4,
@@ -72,16 +72,16 @@ int runTest(T1 IdentityVal1, T1 InitVal1, BinaryOperation1 BOp1,
72
72
auto Dev = Q.get_device ();
73
73
if (AllocType4 == usm::alloc::shared &&
74
74
!Dev.get_info <info::device::usm_shared_allocations>())
75
- return 4 ;
75
+ return 0 ;
76
76
if (AllocType4 == usm::alloc::host &&
77
77
!Dev.get_info <info::device::usm_host_allocations>())
78
- return 4 ;
78
+ return 0 ;
79
79
if (AllocType4 == usm::alloc::device &&
80
80
!Dev.get_info <info::device::usm_device_allocations>())
81
- return 4 ;
81
+ return 0 ;
82
82
T4 *Out4 = (T4 *)malloc (sizeof (T4), Dev, Q.get_context (), AllocType4);
83
83
if (Out4 == nullptr )
84
- return 4 ;
84
+ return 1 ;
85
85
86
86
// Initialize the arrays with sentinel values
87
87
// and pre-compute the expected result 'CorrectOut'.
@@ -114,51 +114,65 @@ int runTest(T1 IdentityVal1, T1 InitVal1, BinaryOperation1 BOp1,
114
114
115
115
if (AllocType4 == usm::alloc::device) {
116
116
Q.submit ([&](handler &CGH) {
117
- CGH.single_task <
118
- KernelNameGroup<ReductionExample, class KernelNameUSM4 >>(
117
+ CGH.single_task <KNameGroup<Name, class KernelNameUSM4 >>(
119
118
[=]() { *Out4 = InitVal4; });
120
119
}).wait ();
121
120
} else {
122
121
*Out4 = InitVal4;
123
122
}
124
123
}
125
124
126
- // The main code to be tested.
127
- Q.submit ([&](handler &CGH) {
128
- auto In1 = InBuf1.template get_access <access::mode::read>(CGH);
129
- auto In2 = InBuf2.template get_access <access::mode::read>(CGH);
130
- auto In3 = InBuf3.template get_access <access::mode::read>(CGH);
131
- auto In4 = InBuf4.template get_access <access::mode::read>(CGH);
132
-
133
- #ifdef TEST_SYCL2020_REDUCTIONS
134
- auto Redu1 = sycl::reduction (OutBuf1, CGH, IdentityVal1, BOp1);
135
- auto Redu2 = sycl::reduction (OutBuf2, CGH, IdentityVal2, BOp2);
136
- auto Redu3 = sycl::reduction (OutBuf3, CGH, IdentityVal3, BOp3);
137
- auto Redu4 = sycl::reduction (Out4, IdentityVal4, BOp4);
138
- #else
139
- auto Out1 = OutBuf1.template get_access <Mode1>(CGH);
140
- auto Out2 = OutBuf2.template get_access <Mode2>(CGH);
141
- accessor<T3, 0 , Mode3, access::target::global_buffer> Out3 (OutBuf3, CGH);
142
-
143
- auto Redu1 = ONEAPI::reduction (Out1, IdentityVal1, BOp1);
144
- auto Redu2 = ONEAPI::reduction (Out2, IdentityVal2, BOp2);
145
- auto Redu3 = ONEAPI::reduction (Out3, IdentityVal3, BOp3);
146
- auto Redu4 = ONEAPI::reduction (Out4, IdentityVal4, BOp4);
147
- #endif
148
-
149
- auto Lambda = [=](nd_item<1 > NDIt, auto &Sum1, auto &Sum2, auto &Sum3,
150
- auto &Sum4) {
151
- size_t I = NDIt.get_global_id (0 );
152
- Sum1.combine (In1[I]);
153
- Sum2.combine (In2[I]);
154
- Sum3.combine (In3[I]);
155
- Sum4.combine (In4[I]);
156
- };
157
-
158
- auto NDR = nd_range<1 >{range<1 >(NWorkItems), range<1 >{WGSize}};
159
- CGH.parallel_for <ReductionExample>(NDR, Redu1, Redu2, Redu3, Redu4,
160
- Lambda);
161
- }).wait ();
125
+ auto NDR = nd_range<1 >{range<1 >(NWorkItems), range<1 >{WGSize}};
126
+ if constexpr (IsSYCL2020Mode) {
127
+ Q.submit ([&](handler &CGH) {
128
+ auto In1 = InBuf1.template get_access <access::mode::read>(CGH);
129
+ auto In2 = InBuf2.template get_access <access::mode::read>(CGH);
130
+ auto In3 = InBuf3.template get_access <access::mode::read>(CGH);
131
+ auto In4 = InBuf4.template get_access <access::mode::read>(CGH);
132
+
133
+ auto Redu1 = sycl::reduction (OutBuf1, CGH, IdentityVal1, BOp1);
134
+ auto Redu2 = sycl::reduction (OutBuf2, CGH, IdentityVal2, BOp2);
135
+ auto Redu3 = sycl::reduction (OutBuf3, CGH, IdentityVal3, BOp3);
136
+ auto Redu4 = sycl::reduction (Out4, IdentityVal4, BOp4);
137
+
138
+ auto Lambda = [=](nd_item<1 > NDIt, auto &Sum1, auto &Sum2, auto &Sum3,
139
+ auto &Sum4) {
140
+ size_t I = NDIt.get_global_id (0 );
141
+ Sum1.combine (In1[I]);
142
+ Sum2.combine (In2[I]);
143
+ Sum3.combine (In3[I]);
144
+ Sum4.combine (In4[I]);
145
+ };
146
+ CGH.parallel_for <Name>(NDR, Redu1, Redu2, Redu3, Redu4, Lambda);
147
+ }).wait ();
148
+ } else {
149
+ // Test ONEAPI reductions
150
+ Q.submit ([&](handler &CGH) {
151
+ auto In1 = InBuf1.template get_access <access::mode::read>(CGH);
152
+ auto In2 = InBuf2.template get_access <access::mode::read>(CGH);
153
+ auto In3 = InBuf3.template get_access <access::mode::read>(CGH);
154
+ auto In4 = InBuf4.template get_access <access::mode::read>(CGH);
155
+
156
+ auto Out1 = OutBuf1.template get_access <Mode1>(CGH);
157
+ auto Out2 = OutBuf2.template get_access <Mode2>(CGH);
158
+ accessor<T3, 0 , Mode3, access::target::global_buffer> Out3 (OutBuf3, CGH);
159
+
160
+ auto Redu1 = ONEAPI::reduction (Out1, IdentityVal1, BOp1);
161
+ auto Redu2 = ONEAPI::reduction (Out2, IdentityVal2, BOp2);
162
+ auto Redu3 = ONEAPI::reduction (Out3, IdentityVal3, BOp3);
163
+ auto Redu4 = ONEAPI::reduction (Out4, IdentityVal4, BOp4);
164
+
165
+ auto Lambda = [=](nd_item<1 > NDIt, auto &Sum1, auto &Sum2, auto &Sum3,
166
+ auto &Sum4) {
167
+ size_t I = NDIt.get_global_id (0 );
168
+ Sum1.combine (In1[I]);
169
+ Sum2.combine (In2[I]);
170
+ Sum3.combine (In3[I]);
171
+ Sum4.combine (In4[I]);
172
+ };
173
+ CGH.parallel_for <Name>(NDR, Redu1, Redu2, Redu3, Redu4, Lambda);
174
+ }).wait ();
175
+ }
162
176
163
177
// Check the results and free memory.
164
178
int Error = 0 ;
@@ -193,24 +207,43 @@ int runTest(T1 IdentityVal1, T1 InitVal1, BinaryOperation1 BOp1,
193
207
return Error;
194
208
}
195
209
196
- int main () {
197
- constexpr access::mode ReadWriteMode = access::mode::read_write;
198
- #ifdef TEST_SYCL2020_REDUCTIONS
210
+ // Tests both implementations of reduction:
211
+ // sycl::reduction and sycl::ONEAPI::reduction
212
+ template <class Name , typename T1, access::mode Mode1, typename T2,
213
+ access::mode Mode2, typename T3, access::mode Mode3, typename T4,
214
+ class BinaryOperation1 , class BinaryOperation2 ,
215
+ class BinaryOperation3 , class BinaryOperation4 >
216
+ int testBoth (T1 IdentityVal1, T1 InitVal1, BinaryOperation1 BOp1,
217
+ T2 IdentityVal2, T2 InitVal2, BinaryOperation2 BOp2,
218
+ T3 IdentityVal3, T3 InitVal3, BinaryOperation3 BOp3,
219
+ T4 IdentityVal4, T3 InitVal4, BinaryOperation4 BOp4,
220
+ usm::alloc AllocType4, size_t NWorkItems, size_t WGSize) {
221
+ int Error =
222
+ testOne<KName<Name, false >, false , T1, Mode1, T2, Mode2, T3, Mode3, T4>(
223
+ IdentityVal1, InitVal1, BOp1, IdentityVal2, InitVal2, BOp2,
224
+ IdentityVal3, InitVal3, BOp3, IdentityVal4, InitVal4, BOp4,
225
+ AllocType4, NWorkItems, WGSize);
226
+
199
227
// TODO: property::reduction::initialize_to_identity is not supported yet.
200
228
// Thus only read_write mode is tested now.
201
- constexpr access::mode DiscardWriteMode = access::mode::read_write;
202
- #else
203
- constexpr access::mode DiscardWriteMode = access::mode::discard_write;
204
- #endif
229
+ constexpr access::mode _Mode1 = (Mode1 == DW) ? RW : Mode1;
230
+ constexpr access::mode _Mode2 = (Mode2 == DW) ? RW : Mode2;
231
+ constexpr access::mode _Mode3 = (Mode3 == DW) ? RW : Mode3;
232
+ Error +=
233
+ testOne<KName<Name, true >, true , T1, _Mode1, T2, _Mode2, T3, _Mode3, T4>(
234
+ IdentityVal1, InitVal1, BOp1, IdentityVal2, InitVal2, BOp2,
235
+ IdentityVal3, InitVal3, BOp3, IdentityVal4, InitVal4, BOp4,
236
+ AllocType4, NWorkItems, WGSize);
237
+ return Error;
238
+ }
205
239
206
- int Error = runTest< class ReduFloatPlus16x1 , float , DiscardWriteMode, int ,
207
- ReadWriteMode , short , ReadWriteMode , int >(
240
+ int main () {
241
+ int Error = testBoth< class FP32Plus16x16 , float , DW, int , RW , short , RW , int >(
208
242
0 , 1000 , std::plus<float >{}, 0 , 2000 , std::plus<>{}, 0 , 4000 ,
209
243
std::bit_or<>{}, 0 , 8000 , std::bit_xor<>{}, usm::alloc::shared, 16 , 16 );
210
244
211
245
auto Add = [](auto x, auto y) { return (x + y); };
212
- Error += runTest<class ReduFloatPlus5x257 , float , ReadWriteMode, int ,
213
- ReadWriteMode, short , DiscardWriteMode, int >(
246
+ Error += testBoth<class FP32Plus5x257 , float , RW, int , RW, short , DW, int >(
214
247
0 , 1000 , std::plus<float >{}, 0 , 2000 , std::plus<>{}, 0 , 4000 , Add, 0 ,
215
248
8000 , std::bit_xor<int >{}, usm::alloc::device, 5 * (256 + 1 ), 5 );
216
249
0 commit comments