[NFC][SYCL][Reduction] Inline single-reduction reduCGFunc (#7273)

aelovikov-intel · web-flow · commit 5a4ed2a6f40e · 2022-11-03T19:24:15.000+01:00
It had two calls at different depth in the call stack making it harder
to reason about.

Also, the call from inside reduction_parallel_for_basic_impl made some
codepath inside it dead. Inlining allows us to eliminate them.
diff --git a/sycl/include/sycl/reduction.hpp b/sycl/include/sycl/reduction.hpp
@@ -2096,30 +2096,6 @@ void reduAuxCGFuncImplArray(
    ...);
 }
 
-template <typename KernelName, typename KernelType, int Dims,
-          typename PropertiesT, class Reduction>
-void reduCGFunc(handler &CGH, KernelType KernelFunc,
-                const nd_range<Dims> &Range, PropertiesT Properties,
-                Reduction &Redu) {
-  if constexpr (Reduction::has_fast_reduce) {
-    if constexpr (Reduction::has_fast_atomics) {
-      reduCGFuncForNDRangeBothFastReduceAndAtomics<KernelName, KernelType>(
-          CGH, KernelFunc, Range, Properties, Redu);
-    } else {
-      reduCGFuncForNDRangeFastReduceOnly<KernelName, KernelType>(
-          CGH, KernelFunc, Range, Properties, Redu);
-    }
-  } else {
-    if constexpr (Reduction::has_fast_atomics) {
-      reduCGFuncForNDRangeFastAtomicsOnly<KernelName, KernelType>(
-          CGH, KernelFunc, Range, Properties, Redu);
-    } else {
-      reduCGFuncForNDRangeBasic<KernelName, KernelType>(
-          CGH, KernelFunc, Range, Properties, Redu);
-    }
-  }
-}
-
 namespace reduction {
 namespace aux_krn {
 template <class KernelName, class Predicate> struct Multi;
@@ -2294,7 +2270,16 @@ void reduction_parallel_for_basic_impl(
                               PI_ERROR_INVALID_WORK_GROUP_SIZE);
 
   // 1. Call the kernel that includes user's lambda function.
-  reduCGFunc<KernelName>(CGH, KernelFunc, Range, Properties, Redu);
+  // We only call this basic version when we can't use atomics to do the final
+  // reduction.
+  assert(!Reduction::has_fast_atomics);
+  if constexpr (Reduction::has_fast_reduce) {
+    reduCGFuncForNDRangeFastReduceOnly<KernelName, KernelType>(
+        CGH, KernelFunc, Range, Properties, Redu);
+  } else {
+    reduCGFuncForNDRangeBasic<KernelName, KernelType>(CGH, KernelFunc, Range,
+                                                      Properties, Redu);
+  }
   reduction::finalizeHandler(CGH);
 
   // 2. Run the additional kernel as many times as needed to reduce
@@ -2331,49 +2316,44 @@ void reduction_parallel_for(handler &CGH,
                             std::shared_ptr<detail::queue_impl> Queue,
                             nd_range<Dims> Range, PropertiesT Properties,
                             Reduction Redu, KernelType KernelFunc) {
-  if constexpr (!Reduction::has_fast_atomics &&
-                !Reduction::has_float64_atomics) {
-    // The most basic implementation.
-    detail::reduction_parallel_for_basic_impl<KernelName>(
-        CGH, Queue, Range, Properties, Redu, KernelFunc);
-    return;
-  } else { // Can't "early" return for "if constexpr".
-    if constexpr (Reduction::has_float64_atomics) {
-      /// This version is a specialization for the add
-      /// operator. It performs runtime checks for device aspect "atomic64";
-      /// if found, fast sycl::atomic_ref operations are used to update the
-      /// reduction at the end of each work-group work. Otherwise the
-      /// default implementation is used.
-      device D = detail::getDeviceFromHandler(CGH);
-
-      if (D.has(aspect::atomic64)) {
-        reduCGFuncForNDRangeBothFastReduceAndAtomics<KernelName>(
-            CGH, KernelFunc, Range, Properties, Redu);
-      } else {
-        // Resort to basic implementation as well.
-        reduction_parallel_for_basic_impl<KernelName>(
-            CGH, Queue, Range, Properties, Redu, KernelFunc);
-        return;
-      }
+  if constexpr (Reduction::has_float64_atomics) {
+    device D = detail::getDeviceFromHandler(CGH);
+
+    if (D.has(aspect::atomic64)) {
+      reduCGFuncForNDRangeBothFastReduceAndAtomics<KernelName>(
+          CGH, KernelFunc, Range, Properties, Redu);
     } else {
-      // Use fast sycl::atomic operations to update reduction variable at the
-      // end of each work-group work.
-      reduCGFunc<KernelName>(CGH, KernelFunc, Range, Properties, Redu);
+      reduction_parallel_for_basic_impl<KernelName>(
+          CGH, Queue, Range, Properties, Redu, KernelFunc);
+      return;
     }
-    // If the reduction variable must be initialized with the identity value
-    // before the kernel run, then an additional working accessor is created,
-    // initialized with the identity value and used in the kernel. That
-    // working accessor is then copied to user's accessor or USM pointer after
-    // the kernel run.
-    // For USM pointers without initialize_to_identity properties the same
-    // scheme with working accessor is used as re-using user's USM pointer in
-    // the kernel would require creation of another variant of user's kernel,
-    // which does not seem efficient.
-    if (Reduction::is_usm || Redu.initializeToIdentity()) {
-      reduction::withAuxHandler(CGH, [&](handler &CopyHandler) {
-        reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
-      });
+  } else if constexpr (Reduction::has_fast_atomics) {
+    if constexpr (Reduction::has_fast_reduce) {
+      reduCGFuncForNDRangeBothFastReduceAndAtomics<KernelName, KernelType>(
+          CGH, KernelFunc, Range, Properties, Redu);
+    } else {
+      reduCGFuncForNDRangeFastAtomicsOnly<KernelName, KernelType>(
+          CGH, KernelFunc, Range, Properties, Redu);
     }
+  } else {
+    reduction_parallel_for_basic_impl<KernelName>(CGH, Queue, Range, Properties,
+                                                  Redu, KernelFunc);
+    return;
+  }
+
+  // If the reduction variable must be initialized with the identity value
+  // before the kernel run, then an additional working accessor is created,
+  // initialized with the identity value and used in the kernel. That
+  // working accessor is then copied to user's accessor or USM pointer after
+  // the kernel run.
+  // For USM pointers without initialize_to_identity properties the same
+  // scheme with working accessor is used as re-using user's USM pointer in
+  // the kernel would require creation of another variant of user's kernel,
+  // which does not seem efficient.
+  if (Reduction::is_usm || Redu.initializeToIdentity()) {
+    reduction::withAuxHandler(CGH, [&](handler &CopyHandler) {
+      reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
+    });
   }
 }