Skip to content

Commit 5a4ed2a

Browse files
[NFC][SYCL][Reduction] Inline single-reduction reduCGFunc (#7273)
It had two calls at different depth in the call stack making it harder to reason about. Also, the call from inside reduction_parallel_for_basic_impl made some codepath inside it dead. Inlining allows us to eliminate them.
1 parent 00e3086 commit 5a4ed2a

File tree

1 file changed

+45
-65
lines changed

1 file changed

+45
-65
lines changed

sycl/include/sycl/reduction.hpp

Lines changed: 45 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -2096,30 +2096,6 @@ void reduAuxCGFuncImplArray(
20962096
...);
20972097
}
20982098

2099-
template <typename KernelName, typename KernelType, int Dims,
2100-
typename PropertiesT, class Reduction>
2101-
void reduCGFunc(handler &CGH, KernelType KernelFunc,
2102-
const nd_range<Dims> &Range, PropertiesT Properties,
2103-
Reduction &Redu) {
2104-
if constexpr (Reduction::has_fast_reduce) {
2105-
if constexpr (Reduction::has_fast_atomics) {
2106-
reduCGFuncForNDRangeBothFastReduceAndAtomics<KernelName, KernelType>(
2107-
CGH, KernelFunc, Range, Properties, Redu);
2108-
} else {
2109-
reduCGFuncForNDRangeFastReduceOnly<KernelName, KernelType>(
2110-
CGH, KernelFunc, Range, Properties, Redu);
2111-
}
2112-
} else {
2113-
if constexpr (Reduction::has_fast_atomics) {
2114-
reduCGFuncForNDRangeFastAtomicsOnly<KernelName, KernelType>(
2115-
CGH, KernelFunc, Range, Properties, Redu);
2116-
} else {
2117-
reduCGFuncForNDRangeBasic<KernelName, KernelType>(
2118-
CGH, KernelFunc, Range, Properties, Redu);
2119-
}
2120-
}
2121-
}
2122-
21232099
namespace reduction {
21242100
namespace aux_krn {
21252101
template <class KernelName, class Predicate> struct Multi;
@@ -2294,7 +2270,16 @@ void reduction_parallel_for_basic_impl(
22942270
PI_ERROR_INVALID_WORK_GROUP_SIZE);
22952271

22962272
// 1. Call the kernel that includes user's lambda function.
2297-
reduCGFunc<KernelName>(CGH, KernelFunc, Range, Properties, Redu);
2273+
// We only call this basic version when we can't use atomics to do the final
2274+
// reduction.
2275+
assert(!Reduction::has_fast_atomics);
2276+
if constexpr (Reduction::has_fast_reduce) {
2277+
reduCGFuncForNDRangeFastReduceOnly<KernelName, KernelType>(
2278+
CGH, KernelFunc, Range, Properties, Redu);
2279+
} else {
2280+
reduCGFuncForNDRangeBasic<KernelName, KernelType>(CGH, KernelFunc, Range,
2281+
Properties, Redu);
2282+
}
22982283
reduction::finalizeHandler(CGH);
22992284

23002285
// 2. Run the additional kernel as many times as needed to reduce
@@ -2331,49 +2316,44 @@ void reduction_parallel_for(handler &CGH,
23312316
std::shared_ptr<detail::queue_impl> Queue,
23322317
nd_range<Dims> Range, PropertiesT Properties,
23332318
Reduction Redu, KernelType KernelFunc) {
2334-
if constexpr (!Reduction::has_fast_atomics &&
2335-
!Reduction::has_float64_atomics) {
2336-
// The most basic implementation.
2337-
detail::reduction_parallel_for_basic_impl<KernelName>(
2338-
CGH, Queue, Range, Properties, Redu, KernelFunc);
2339-
return;
2340-
} else { // Can't "early" return for "if constexpr".
2341-
if constexpr (Reduction::has_float64_atomics) {
2342-
/// This version is a specialization for the add
2343-
/// operator. It performs runtime checks for device aspect "atomic64";
2344-
/// if found, fast sycl::atomic_ref operations are used to update the
2345-
/// reduction at the end of each work-group work. Otherwise the
2346-
/// default implementation is used.
2347-
device D = detail::getDeviceFromHandler(CGH);
2348-
2349-
if (D.has(aspect::atomic64)) {
2350-
reduCGFuncForNDRangeBothFastReduceAndAtomics<KernelName>(
2351-
CGH, KernelFunc, Range, Properties, Redu);
2352-
} else {
2353-
// Resort to basic implementation as well.
2354-
reduction_parallel_for_basic_impl<KernelName>(
2355-
CGH, Queue, Range, Properties, Redu, KernelFunc);
2356-
return;
2357-
}
2319+
if constexpr (Reduction::has_float64_atomics) {
2320+
device D = detail::getDeviceFromHandler(CGH);
2321+
2322+
if (D.has(aspect::atomic64)) {
2323+
reduCGFuncForNDRangeBothFastReduceAndAtomics<KernelName>(
2324+
CGH, KernelFunc, Range, Properties, Redu);
23582325
} else {
2359-
// Use fast sycl::atomic operations to update reduction variable at the
2360-
// end of each work-group work.
2361-
reduCGFunc<KernelName>(CGH, KernelFunc, Range, Properties, Redu);
2326+
reduction_parallel_for_basic_impl<KernelName>(
2327+
CGH, Queue, Range, Properties, Redu, KernelFunc);
2328+
return;
23622329
}
2363-
// If the reduction variable must be initialized with the identity value
2364-
// before the kernel run, then an additional working accessor is created,
2365-
// initialized with the identity value and used in the kernel. That
2366-
// working accessor is then copied to user's accessor or USM pointer after
2367-
// the kernel run.
2368-
// For USM pointers without initialize_to_identity properties the same
2369-
// scheme with working accessor is used as re-using user's USM pointer in
2370-
// the kernel would require creation of another variant of user's kernel,
2371-
// which does not seem efficient.
2372-
if (Reduction::is_usm || Redu.initializeToIdentity()) {
2373-
reduction::withAuxHandler(CGH, [&](handler &CopyHandler) {
2374-
reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
2375-
});
2330+
} else if constexpr (Reduction::has_fast_atomics) {
2331+
if constexpr (Reduction::has_fast_reduce) {
2332+
reduCGFuncForNDRangeBothFastReduceAndAtomics<KernelName, KernelType>(
2333+
CGH, KernelFunc, Range, Properties, Redu);
2334+
} else {
2335+
reduCGFuncForNDRangeFastAtomicsOnly<KernelName, KernelType>(
2336+
CGH, KernelFunc, Range, Properties, Redu);
23762337
}
2338+
} else {
2339+
reduction_parallel_for_basic_impl<KernelName>(CGH, Queue, Range, Properties,
2340+
Redu, KernelFunc);
2341+
return;
2342+
}
2343+
2344+
// If the reduction variable must be initialized with the identity value
2345+
// before the kernel run, then an additional working accessor is created,
2346+
// initialized with the identity value and used in the kernel. That
2347+
// working accessor is then copied to user's accessor or USM pointer after
2348+
// the kernel run.
2349+
// For USM pointers without initialize_to_identity properties the same
2350+
// scheme with working accessor is used as re-using user's USM pointer in
2351+
// the kernel would require creation of another variant of user's kernel,
2352+
// which does not seem efficient.
2353+
if (Reduction::is_usm || Redu.initializeToIdentity()) {
2354+
reduction::withAuxHandler(CGH, [&](handler &CopyHandler) {
2355+
reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
2356+
});
23772357
}
23782358
}
23792359

0 commit comments

Comments
 (0)