@@ -2096,30 +2096,6 @@ void reduAuxCGFuncImplArray(
2096
2096
...);
2097
2097
}
2098
2098
2099
- template <typename KernelName, typename KernelType, int Dims,
2100
- typename PropertiesT, class Reduction >
2101
- void reduCGFunc (handler &CGH, KernelType KernelFunc,
2102
- const nd_range<Dims> &Range, PropertiesT Properties,
2103
- Reduction &Redu) {
2104
- if constexpr (Reduction::has_fast_reduce) {
2105
- if constexpr (Reduction::has_fast_atomics) {
2106
- reduCGFuncForNDRangeBothFastReduceAndAtomics<KernelName, KernelType>(
2107
- CGH, KernelFunc, Range, Properties, Redu);
2108
- } else {
2109
- reduCGFuncForNDRangeFastReduceOnly<KernelName, KernelType>(
2110
- CGH, KernelFunc, Range, Properties, Redu);
2111
- }
2112
- } else {
2113
- if constexpr (Reduction::has_fast_atomics) {
2114
- reduCGFuncForNDRangeFastAtomicsOnly<KernelName, KernelType>(
2115
- CGH, KernelFunc, Range, Properties, Redu);
2116
- } else {
2117
- reduCGFuncForNDRangeBasic<KernelName, KernelType>(
2118
- CGH, KernelFunc, Range, Properties, Redu);
2119
- }
2120
- }
2121
- }
2122
-
2123
2099
namespace reduction {
2124
2100
namespace aux_krn {
2125
2101
template <class KernelName , class Predicate > struct Multi ;
@@ -2294,7 +2270,16 @@ void reduction_parallel_for_basic_impl(
2294
2270
PI_ERROR_INVALID_WORK_GROUP_SIZE);
2295
2271
2296
2272
// 1. Call the kernel that includes user's lambda function.
2297
- reduCGFunc<KernelName>(CGH, KernelFunc, Range, Properties, Redu);
2273
+ // We only call this basic version when we can't use atomics to do the final
2274
+ // reduction.
2275
+ assert (!Reduction::has_fast_atomics);
2276
+ if constexpr (Reduction::has_fast_reduce) {
2277
+ reduCGFuncForNDRangeFastReduceOnly<KernelName, KernelType>(
2278
+ CGH, KernelFunc, Range, Properties, Redu);
2279
+ } else {
2280
+ reduCGFuncForNDRangeBasic<KernelName, KernelType>(CGH, KernelFunc, Range,
2281
+ Properties, Redu);
2282
+ }
2298
2283
reduction::finalizeHandler (CGH);
2299
2284
2300
2285
// 2. Run the additional kernel as many times as needed to reduce
@@ -2331,49 +2316,44 @@ void reduction_parallel_for(handler &CGH,
2331
2316
std::shared_ptr<detail::queue_impl> Queue,
2332
2317
nd_range<Dims> Range, PropertiesT Properties,
2333
2318
Reduction Redu, KernelType KernelFunc) {
2334
- if constexpr (!Reduction::has_fast_atomics &&
2335
- !Reduction::has_float64_atomics) {
2336
- // The most basic implementation.
2337
- detail::reduction_parallel_for_basic_impl<KernelName>(
2338
- CGH, Queue, Range, Properties, Redu, KernelFunc);
2339
- return ;
2340
- } else { // Can't "early" return for "if constexpr".
2341
- if constexpr (Reduction::has_float64_atomics) {
2342
- // / This version is a specialization for the add
2343
- // / operator. It performs runtime checks for device aspect "atomic64";
2344
- // / if found, fast sycl::atomic_ref operations are used to update the
2345
- // / reduction at the end of each work-group work. Otherwise the
2346
- // / default implementation is used.
2347
- device D = detail::getDeviceFromHandler (CGH);
2348
-
2349
- if (D.has (aspect::atomic64)) {
2350
- reduCGFuncForNDRangeBothFastReduceAndAtomics<KernelName>(
2351
- CGH, KernelFunc, Range, Properties, Redu);
2352
- } else {
2353
- // Resort to basic implementation as well.
2354
- reduction_parallel_for_basic_impl<KernelName>(
2355
- CGH, Queue, Range, Properties, Redu, KernelFunc);
2356
- return ;
2357
- }
2319
+ if constexpr (Reduction::has_float64_atomics) {
2320
+ device D = detail::getDeviceFromHandler (CGH);
2321
+
2322
+ if (D.has (aspect::atomic64)) {
2323
+ reduCGFuncForNDRangeBothFastReduceAndAtomics<KernelName>(
2324
+ CGH, KernelFunc, Range, Properties, Redu);
2358
2325
} else {
2359
- // Use fast sycl::atomic operations to update reduction variable at the
2360
- // end of each work-group work.
2361
- reduCGFunc<KernelName>(CGH, KernelFunc, Range, Properties, Redu) ;
2326
+ reduction_parallel_for_basic_impl<KernelName>(
2327
+ CGH, Queue, Range, Properties, Redu, KernelFunc);
2328
+ return ;
2362
2329
}
2363
- // If the reduction variable must be initialized with the identity value
2364
- // before the kernel run, then an additional working accessor is created,
2365
- // initialized with the identity value and used in the kernel. That
2366
- // working accessor is then copied to user's accessor or USM pointer after
2367
- // the kernel run.
2368
- // For USM pointers without initialize_to_identity properties the same
2369
- // scheme with working accessor is used as re-using user's USM pointer in
2370
- // the kernel would require creation of another variant of user's kernel,
2371
- // which does not seem efficient.
2372
- if (Reduction::is_usm || Redu.initializeToIdentity ()) {
2373
- reduction::withAuxHandler (CGH, [&](handler &CopyHandler) {
2374
- reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
2375
- });
2330
+ } else if constexpr (Reduction::has_fast_atomics) {
2331
+ if constexpr (Reduction::has_fast_reduce) {
2332
+ reduCGFuncForNDRangeBothFastReduceAndAtomics<KernelName, KernelType>(
2333
+ CGH, KernelFunc, Range, Properties, Redu);
2334
+ } else {
2335
+ reduCGFuncForNDRangeFastAtomicsOnly<KernelName, KernelType>(
2336
+ CGH, KernelFunc, Range, Properties, Redu);
2376
2337
}
2338
+ } else {
2339
+ reduction_parallel_for_basic_impl<KernelName>(CGH, Queue, Range, Properties,
2340
+ Redu, KernelFunc);
2341
+ return ;
2342
+ }
2343
+
2344
+ // If the reduction variable must be initialized with the identity value
2345
+ // before the kernel run, then an additional working accessor is created,
2346
+ // initialized with the identity value and used in the kernel. That
2347
+ // working accessor is then copied to user's accessor or USM pointer after
2348
+ // the kernel run.
2349
+ // For USM pointers without initialize_to_identity properties the same
2350
+ // scheme with working accessor is used as re-using user's USM pointer in
2351
+ // the kernel would require creation of another variant of user's kernel,
2352
+ // which does not seem efficient.
2353
+ if (Reduction::is_usm || Redu.initializeToIdentity ()) {
2354
+ reduction::withAuxHandler (CGH, [&](handler &CopyHandler) {
2355
+ reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
2356
+ });
2377
2357
}
2378
2358
}
2379
2359
0 commit comments