27
27
#include < numeric>
28
28
#include < string>
29
29
30
- template <typename ... Ts> class KernelNameGroup ;
31
-
32
30
using namespace cl ::sycl;
33
31
32
+ template <typename ... Ts> class KNameGroup ;
33
+ template <typename T, bool B> class KName ;
34
+
35
+ constexpr access::mode RW = access::mode::read_write;
36
+ constexpr access::mode DW = access::mode::discard_write;
37
+
34
38
template <typename T>
35
39
bool cherkResultIsExpected (int TestCaseNum, T Expected, T Computed) {
36
40
bool Success;
@@ -46,11 +50,12 @@ bool cherkResultIsExpected(int TestCaseNum, T Expected, T Computed) {
46
50
return Success;
47
51
}
48
52
49
- template <class ReductionExample , typename T1, access::mode Mode1, typename T2,
50
- access::mode Mode2, typename T3, access::mode Mode3, typename T4,
51
- class BinaryOperation1 , class BinaryOperation2 ,
53
+ // Returns 0 if the test case passed. Otherwise, some non-zero value.
54
+ template <class Name , bool IsSYCL2020Mode, typename T1, access::mode Mode1,
55
+ typename T2, access::mode Mode2, typename T3, access::mode Mode3,
56
+ typename T4, class BinaryOperation1 , class BinaryOperation2 ,
52
57
class BinaryOperation3 , class BinaryOperation4 >
53
- int runTest (T1 IdentityVal1, T1 InitVal1, BinaryOperation1 BOp1,
58
+ int testOne (T1 IdentityVal1, T1 InitVal1, BinaryOperation1 BOp1,
54
59
T2 IdentityVal2, T2 InitVal2, BinaryOperation2 BOp2,
55
60
T3 IdentityVal3, T3 InitVal3, BinaryOperation3 BOp3,
56
61
T4 IdentityVal4, T3 InitVal4, BinaryOperation4 BOp4,
@@ -67,16 +72,16 @@ int runTest(T1 IdentityVal1, T1 InitVal1, BinaryOperation1 BOp1,
67
72
auto Dev = Q.get_device ();
68
73
if (AllocType4 == usm::alloc::shared &&
69
74
!Dev.get_info <info::device::usm_shared_allocations>())
70
- return 4 ;
75
+ return 0 ;
71
76
if (AllocType4 == usm::alloc::host &&
72
77
!Dev.get_info <info::device::usm_host_allocations>())
73
- return 4 ;
78
+ return 0 ;
74
79
if (AllocType4 == usm::alloc::device &&
75
80
!Dev.get_info <info::device::usm_device_allocations>())
76
- return 4 ;
81
+ return 0 ;
77
82
T4 *Out4 = (T4 *)malloc (sizeof (T4), Dev, Q.get_context (), AllocType4);
78
83
if (Out4 == nullptr )
79
- return 4 ;
84
+ return 1 ;
80
85
81
86
// Initialize the arrays with sentinel values
82
87
// and pre-compute the expected result 'CorrectOut'.
@@ -109,48 +114,65 @@ int runTest(T1 IdentityVal1, T1 InitVal1, BinaryOperation1 BOp1,
109
114
110
115
if (AllocType4 == usm::alloc::device) {
111
116
Q.submit ([&](handler &CGH) {
112
- CGH.single_task <
113
- KernelNameGroup<ReductionExample, class KernelNameUSM4 >>(
117
+ CGH.single_task <KNameGroup<Name, class KernelNameUSM4 >>(
114
118
[=]() { *Out4 = InitVal4; });
115
119
}).wait ();
116
120
} else {
117
121
*Out4 = InitVal4;
118
122
}
119
123
}
120
124
121
- // The main code to be tested.
122
- Q.submit ([&](handler &CGH) {
123
- auto In1 = InBuf1.template get_access <access::mode::read>(CGH);
124
- auto In2 = InBuf2.template get_access <access::mode::read>(CGH);
125
- auto In3 = InBuf3.template get_access <access::mode::read>(CGH);
126
- auto In4 = InBuf4.template get_access <access::mode::read>(CGH);
127
-
128
- auto Out1 = OutBuf1.template get_access <Mode1>(CGH);
129
- auto Out2 = OutBuf2.template get_access <Mode2>(CGH);
130
- accessor<T3, 0 , Mode3, access::target::global_buffer> Out3 (OutBuf3, CGH);
131
-
132
- auto Lambda = [=](nd_item<1 > NDIt, auto &Sum1, auto &Sum2, auto &Sum3,
133
- auto &Sum4) {
134
- size_t I = NDIt.get_global_id (0 );
135
- Sum1.combine (In1[I]);
136
- Sum2.combine (In2[I]);
137
- Sum3.combine (In3[I]);
138
- Sum4.combine (In4[I]);
139
- };
140
-
141
- auto Redu1 =
142
- ONEAPI::reduction<T1, BinaryOperation1>(Out1, IdentityVal1, BOp1);
143
- auto Redu2 =
144
- ONEAPI::reduction<T2, BinaryOperation2>(Out2, IdentityVal2, BOp2);
145
- auto Redu3 =
146
- ONEAPI::reduction<T3, BinaryOperation3>(Out3, IdentityVal3, BOp3);
147
- auto Redu4 =
148
- ONEAPI::reduction<T4, BinaryOperation4>(Out4, IdentityVal4, BOp4);
149
-
150
- auto NDR = nd_range<1 >{range<1 >(NWorkItems), range<1 >{WGSize}};
151
- CGH.parallel_for <ReductionExample>(NDR, Redu1, Redu2, Redu3, Redu4,
152
- Lambda);
153
- }).wait ();
125
+ auto NDR = nd_range<1 >{range<1 >(NWorkItems), range<1 >{WGSize}};
126
+ if constexpr (IsSYCL2020Mode) {
127
+ Q.submit ([&](handler &CGH) {
128
+ auto In1 = InBuf1.template get_access <access::mode::read>(CGH);
129
+ auto In2 = InBuf2.template get_access <access::mode::read>(CGH);
130
+ auto In3 = InBuf3.template get_access <access::mode::read>(CGH);
131
+ auto In4 = InBuf4.template get_access <access::mode::read>(CGH);
132
+
133
+ auto Redu1 = sycl::reduction (OutBuf1, CGH, IdentityVal1, BOp1);
134
+ auto Redu2 = sycl::reduction (OutBuf2, CGH, IdentityVal2, BOp2);
135
+ auto Redu3 = sycl::reduction (OutBuf3, CGH, IdentityVal3, BOp3);
136
+ auto Redu4 = sycl::reduction (Out4, IdentityVal4, BOp4);
137
+
138
+ auto Lambda = [=](nd_item<1 > NDIt, auto &Sum1, auto &Sum2, auto &Sum3,
139
+ auto &Sum4) {
140
+ size_t I = NDIt.get_global_id (0 );
141
+ Sum1.combine (In1[I]);
142
+ Sum2.combine (In2[I]);
143
+ Sum3.combine (In3[I]);
144
+ Sum4.combine (In4[I]);
145
+ };
146
+ CGH.parallel_for <Name>(NDR, Redu1, Redu2, Redu3, Redu4, Lambda);
147
+ }).wait ();
148
+ } else {
149
+ // Test ONEAPI reductions
150
+ Q.submit ([&](handler &CGH) {
151
+ auto In1 = InBuf1.template get_access <access::mode::read>(CGH);
152
+ auto In2 = InBuf2.template get_access <access::mode::read>(CGH);
153
+ auto In3 = InBuf3.template get_access <access::mode::read>(CGH);
154
+ auto In4 = InBuf4.template get_access <access::mode::read>(CGH);
155
+
156
+ auto Out1 = OutBuf1.template get_access <Mode1>(CGH);
157
+ auto Out2 = OutBuf2.template get_access <Mode2>(CGH);
158
+ accessor<T3, 0 , Mode3, access::target::global_buffer> Out3 (OutBuf3, CGH);
159
+
160
+ auto Redu1 = ONEAPI::reduction (Out1, IdentityVal1, BOp1);
161
+ auto Redu2 = ONEAPI::reduction (Out2, IdentityVal2, BOp2);
162
+ auto Redu3 = ONEAPI::reduction (Out3, IdentityVal3, BOp3);
163
+ auto Redu4 = ONEAPI::reduction (Out4, IdentityVal4, BOp4);
164
+
165
+ auto Lambda = [=](nd_item<1 > NDIt, auto &Sum1, auto &Sum2, auto &Sum3,
166
+ auto &Sum4) {
167
+ size_t I = NDIt.get_global_id (0 );
168
+ Sum1.combine (In1[I]);
169
+ Sum2.combine (In2[I]);
170
+ Sum3.combine (In3[I]);
171
+ Sum4.combine (In4[I]);
172
+ };
173
+ CGH.parallel_for <Name>(NDR, Redu1, Redu2, Redu3, Redu4, Lambda);
174
+ }).wait ();
175
+ }
154
176
155
177
// Check the results and free memory.
156
178
int Error = 0 ;
@@ -185,18 +207,43 @@ int runTest(T1 IdentityVal1, T1 InitVal1, BinaryOperation1 BOp1,
185
207
return Error;
186
208
}
187
209
188
- int main () {
210
+ // Tests both implementations of reduction:
211
+ // sycl::reduction and sycl::ONEAPI::reduction
212
+ template <class Name , typename T1, access::mode Mode1, typename T2,
213
+ access::mode Mode2, typename T3, access::mode Mode3, typename T4,
214
+ class BinaryOperation1 , class BinaryOperation2 ,
215
+ class BinaryOperation3 , class BinaryOperation4 >
216
+ int testBoth (T1 IdentityVal1, T1 InitVal1, BinaryOperation1 BOp1,
217
+ T2 IdentityVal2, T2 InitVal2, BinaryOperation2 BOp2,
218
+ T3 IdentityVal3, T3 InitVal3, BinaryOperation3 BOp3,
219
+ T4 IdentityVal4, T3 InitVal4, BinaryOperation4 BOp4,
220
+ usm::alloc AllocType4, size_t NWorkItems, size_t WGSize) {
189
221
int Error =
190
- runTest<class ReduFloatPlus16x1 , float , access::mode::discard_write, int ,
191
- access::mode::read_write, short , access::mode::read_write, int >(
192
- 0 , 1000 , std::plus<float >{}, 0 , 2000 , std::plus<>{}, 0 , 4000 ,
193
- std::bit_or<>{}, 0 , 8000 , std::bit_xor<>{}, usm::alloc::shared, 16 ,
194
- 16 );
222
+ testOne<KName<Name, false >, false , T1, Mode1, T2, Mode2, T3, Mode3, T4>(
223
+ IdentityVal1, InitVal1, BOp1, IdentityVal2, InitVal2, BOp2,
224
+ IdentityVal3, InitVal3, BOp3, IdentityVal4, InitVal4, BOp4,
225
+ AllocType4, NWorkItems, WGSize);
226
+
227
+ // TODO: property::reduction::initialize_to_identity is not supported yet.
228
+ // Thus only read_write mode is tested now.
229
+ constexpr access::mode _Mode1 = (Mode1 == DW) ? RW : Mode1;
230
+ constexpr access::mode _Mode2 = (Mode2 == DW) ? RW : Mode2;
231
+ constexpr access::mode _Mode3 = (Mode3 == DW) ? RW : Mode3;
232
+ Error +=
233
+ testOne<KName<Name, true >, true , T1, _Mode1, T2, _Mode2, T3, _Mode3, T4>(
234
+ IdentityVal1, InitVal1, BOp1, IdentityVal2, InitVal2, BOp2,
235
+ IdentityVal3, InitVal3, BOp3, IdentityVal4, InitVal4, BOp4,
236
+ AllocType4, NWorkItems, WGSize);
237
+ return Error;
238
+ }
239
+
240
+ int main () {
241
+ int Error = testBoth<class FP32Plus16x16 , float , DW, int , RW, short , RW, int >(
242
+ 0 , 1000 , std::plus<float >{}, 0 , 2000 , std::plus<>{}, 0 , 4000 ,
243
+ std::bit_or<>{}, 0 , 8000 , std::bit_xor<>{}, usm::alloc::shared, 16 , 16 );
195
244
196
245
auto Add = [](auto x, auto y) { return (x + y); };
197
- Error += runTest<class ReduFloatPlus5x257 , float , access::mode::read_write,
198
- int , access::mode::read_write, short ,
199
- access::mode::discard_write, int >(
246
+ Error += testBoth<class FP32Plus5x257 , float , RW, int , RW, short , DW, int >(
200
247
0 , 1000 , std::plus<float >{}, 0 , 2000 , std::plus<>{}, 0 , 4000 , Add, 0 ,
201
248
8000 , std::bit_xor<int >{}, usm::alloc::device, 5 * (256 + 1 ), 5 );
202
249
0 commit comments