@@ -162,6 +162,126 @@ struct slice_area {
};
};

+
+ // a way to get value_type from both accessors and USM that is needed for transform_init
+ template <typename Unknown>
+ struct accessor_traits
+ {
+ };
+
+ template <typename T, int Dim, sycl::access::mode AccMode, sycl::access::target AccTarget,
+           sycl::access::placeholder Placeholder>
+ struct accessor_traits<sycl::accessor<T, Dim, AccMode, AccTarget, Placeholder>>
+ {
+   using value_type = typename sycl::accessor<T, Dim, AccMode, AccTarget, Placeholder>::value_type;
+ };
+
+ template <typename RawArrayValueType>
+ struct accessor_traits<RawArrayValueType*>
+ {
+   using value_type = RawArrayValueType;
+ };
+
+ // calculate shift where we should start processing on current item
+ template <typename NDItemId, typename GlobalIdx, typename SizeNIter, typename SizeN>
+ SizeN
+ calc_shift(const NDItemId item_id, const GlobalIdx global_idx, SizeNIter& n_iter, const SizeN n)
+ {
+   auto global_range_size = item_id.get_global_range().size();
+
+   auto start = n_iter * global_idx;
+   auto global_shift = global_idx + n_iter * global_range_size;
+   if (n_iter > 0 && global_shift > n)
+   {
+     start += n % global_range_size - global_idx;
+   }
+   else if (global_shift < n)
+   {
+     n_iter++;
+   }
+   return start;
+ }
+
+
+ template <typename ExecutionPolicy, typename Operation1, typename Operation2>
+ struct transform_init
+ {
+   Operation1 binary_op;
+   Operation2 unary_op;
+
+   template <typename NDItemId, typename GlobalIdx, typename Size, typename AccLocal, typename... Acc>
+   void
+   operator()(const NDItemId item_id, const GlobalIdx global_idx, Size n, AccLocal& local_mem,
+              const Acc&... acc)
+   {
+     auto local_idx = item_id.get_local_id(0);
+     auto global_range_size = item_id.get_global_range().size();
+     auto n_iter = n / global_range_size;
+     auto start = calc_shift(item_id, global_idx, n_iter, n);
+     auto shifted_global_idx = global_idx + start;
+
+     typename accessor_traits<AccLocal>::value_type res;
+     if (global_idx < n)
+     {
+       res = unary_op(shifted_global_idx, acc...);
+     }
+     // Add neighbour to the current local_mem
+     for (decltype(n_iter) i = 1; i < n_iter; ++i)
+     {
+       res = binary_op(res, unary_op(shifted_global_idx + i, acc...));
+     }
+     if (global_idx < n)
+     {
+       local_mem[local_idx] = res;
+     }
+   }
+ };
+
+
+ // Reduce on local memory
+ template <typename ExecutionPolicy, typename BinaryOperation1, typename Tp>
+ struct reduce
+ {
+   BinaryOperation1 bin_op1;
+
+   template <typename NDItemId, typename GlobalIdx, typename Size, typename AccLocal>
+   Tp
+   operator()(const NDItemId item_id, const GlobalIdx global_idx, const Size n, AccLocal& local_mem)
+   {
+     auto local_idx = item_id.get_local_id(0);
+     auto group_size = item_id.get_local_range().size();
+
+     auto k = 1;
+     do
+     {
+       item_id.barrier(sycl::access::fence_space::local_space);
+       if (local_idx % (2 * k) == 0 && local_idx + k < group_size && global_idx < n &&
+           global_idx + k < n)
+       {
+         local_mem[local_idx] = bin_op1(local_mem[local_idx], local_mem[local_idx + k]);
+       }
+       k *= 2;
+     } while (k < group_size);
+     return local_mem[local_idx];
+   }
+ };
+
+
+ // walk through the data
+ template <typename ExecutionPolicy, typename F>
+ struct walk_n
+ {
+   F f;
+
+   template <typename ItemId, typename... Ranges>
+   auto
+   operator()(const ItemId idx, Ranges&&... rngs) -> decltype(f(rngs[idx]...))
+   {
+     return f(rngs[idx]...);
+   }
+ };
+
+
// This option uses a parallel for to fill the buffer and then
// uses a transform_init with plus/no_op and then
// a local reduction then global reduction.
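
To make the partitioning logic easier to follow, here is a small host-side analogue of the calc_shift helper added above. This is a hypothetical sketch, not part of this commit: the nd_item argument is replaced by an explicit global_range_size, and signed integers are used so the negative correction to start is explicit.

// Hypothetical host-side analogue of calc_shift: the nd_item argument is
// replaced by an explicit global_range_size so the partitioning can be
// checked without a device.
#include <iostream>

long calc_shift_host(long global_idx, long global_range_size, long& n_iter, long n) {
  long start = n_iter * global_idx;
  long global_shift = global_idx + n_iter * global_range_size;
  if (n_iter > 0 && global_shift > n) {
    start += n % global_range_size - global_idx;
  } else if (global_shift < n) {
    n_iter++;
  }
  return start;
}

int main() {
  const long n = 10, global_range_size = 4;
  for (long global_idx = 0; global_idx < global_range_size; ++global_idx) {
    long n_iter = n / global_range_size;  // 2 for every work-item before the call
    long start = calc_shift_host(global_idx, global_range_size, n_iter, n);
    long first = global_idx + start;      // shifted_global_idx in transform_init
    std::cout << "item " << global_idx << " -> elements [" << first << ", "
              << first + n_iter << ")\n";
  }
  // Prints: item 0 -> [0, 3), item 1 -> [3, 6), item 2 -> [6, 8), item 3 -> [8, 10)
}

With n = 10 elements and a global range of 4 work-items, the items end up owning the disjoint sub-ranges [0, 3), [3, 6), [6, 8) and [8, 10), which is what transform_init relies on when it accumulates n_iter neighbouring elements per item.
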
@@ -187,21 +307,18 @@ float calc_pi_dpstd_native3(size_t num_steps, int groups, Policy&& policy) {
auto calc_begin = oneapi::dpl::begin(buf);
auto calc_end = oneapi::dpl::end(buf);

- using Functor = oneapi::dpl::unseq_backend::walk_n<Policy, my_no_op>;
+ using Functor = walk_n<Policy, my_no_op>;
float result;

// Functor will do nothing for transform_init and will use plus for reduce.
// In this example we have done the calculation and filled the buffer above
// The way transform_init works is that you need to have the value already
// populated in the buffer.
- auto tf_init =
-     oneapi::dpl::unseq_backend::transform_init<Policy, std::plus<float>,
-                                                Functor>{std::plus<float>(),
-                                                         Functor{my_no_op()}};
+ auto tf_init = transform_init<Policy, std::plus<float>,
+                               Functor>{std::plus<float>(), Functor{my_no_op()}};

auto combine = std::plus<float>();
- auto brick_reduce =
-     oneapi::dpl::unseq_backend::reduce<Policy, std::plus<float>, float>{
+ auto brick_reduce = reduce<Policy, std::plus<float>, float>{
    std::plus<float>()};
auto workgroup_size =
    policy.queue()
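
As a sanity check on the local-memory step, the following is a host-side illustration of the tree reduction that the reduce functor (instantiated here as brick_reduce) performs inside one work-group. It is a hypothetical sketch, not part of this commit; the barrier and the global_idx < n guards are dropped because everything runs on a single thread.

// Hypothetical host-side illustration of the tree reduction in `reduce`:
// the stride k doubles every pass and local_mem[0] ends up holding the
// work-group total.
#include <functional>
#include <iostream>
#include <vector>

int main() {
  std::vector<float> local_mem = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f};
  const std::size_t group_size = local_mem.size();
  auto bin_op1 = std::plus<float>();

  std::size_t k = 1;
  do {
    // (on the device a work-group barrier separates the passes)
    for (std::size_t local_idx = 0; local_idx < group_size; ++local_idx) {
      if (local_idx % (2 * k) == 0 && local_idx + k < group_size) {
        local_mem[local_idx] = bin_op1(local_mem[local_idx], local_mem[local_idx + k]);
      }
    }
    k *= 2;
  } while (k < group_size);

  std::cout << local_mem[0] << "\n";  // 36 = 1 + 2 + ... + 8
}

Because the stride doubles on every pass, a work-group of size G finishes in about log2(G) passes, and the group's partial result is left in local_mem[0].
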
@@ -293,19 +410,17 @@ float calc_pi_dpstd_native4(size_t num_steps, int groups, Policy&& policy) {
auto calc_begin = oneapi::dpl::begin(buf2);
auto calc_end = oneapi::dpl::end(buf2);

- using Functor2 = oneapi::dpl::unseq_backend::walk_n<Policy, slice_area>;
+ using Functor2 = walk_n<Policy, slice_area>;

// The buffer has 1...num in it and now we will use that as an input
// to the slice structure which will calculate the area of each
// rectangle.
- auto tf_init =
-     oneapi::dpl::unseq_backend::transform_init<Policy, std::plus<float>,
+ auto tf_init = transform_init<Policy, std::plus<float>,
                               Functor2>{
      std::plus<float>(), Functor2{slice_area(num_steps)}};

auto combine = std::plus<float>();
- auto brick_reduce =
-     oneapi::dpl::unseq_backend::reduce<Policy, std::plus<float>, float>{
+ auto brick_reduce = reduce<Policy, std::plus<float>, float>{
    std::plus<float>()};

// get workgroup_size from the device
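
Finally, the walk_n wrapper that replaces oneapi::dpl::unseq_backend::walk_n in both kernels is just an index-apply adaptor. Below is a minimal host-side sketch, hypothetical and not from this commit, of what Functor and Functor2 do with their stored callables; the my_square functor is invented for the example.

// Hypothetical host-side use of a walk_n-style wrapper: it forwards the
// idx-th element of every range to the stored callable f.
#include <array>
#include <iostream>

template <typename ExecutionPolicy, typename F>
struct walk_n {
  F f;

  template <typename ItemId, typename... Ranges>
  auto operator()(const ItemId idx, Ranges&&... rngs) -> decltype(f(rngs[idx]...)) {
    return f(rngs[idx]...);
  }
};

struct my_square {
  float operator()(float v) const { return v * v; }
};

int main() {
  std::array<float, 4> data = {1.f, 2.f, 3.f, 4.f};
  walk_n<void, my_square> w{my_square{}};  // the ExecutionPolicy parameter is only a tag here
  std::cout << w(2, data) << "\n";         // prints 9 (data[2] squared)
}

In the pi kernels the stored callable is my_no_op or slice_area rather than the my_square used here.
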