Skip to content

Commit ac3fd7c

Browse files
[SYCL][Reduction] Use "if constexpr" over SFINAE
Several reasons: * The code is easier to read/understand. * Ability to mix compiler/run-time conditions in a single instance of logic description, including those in future (e.g. discrete vs integrated GPU). * Eliminate duplication of similar parts of different algorithms so that distinctions are better highlighted. * Ease of experiments when trying to switch algorithm used by decoupling hard requirements from the performance tuning logic (i.e., calling slower code must not result in a compile-time error). * Allows us to stop encoding implementation selection logic into overloads, and use distinct names instead (e.g., see forward declarations in handler.hpp). Drawbacks: * The feature is only enabled in C++17 mode now. The customers who need C++14 mode aren't expected to require it. Mitigation plan would be to change remaining SFINAE logic inside reduction.hpp into runtime asserts and have "constexpr" changed to a compile-time macro __SYCL_REDUCTION_CONSTEXPR to resort to run-time only code selection in C++14 mode.
1 parent 49f72fa commit ac3fd7c

File tree

2 files changed

+427
-497
lines changed

2 files changed

+427
-497
lines changed

sycl/include/CL/sycl/handler.hpp

Lines changed: 112 additions & 154 deletions
Original file line numberDiff line numberDiff line change
@@ -247,37 +247,36 @@ class reduction_impl_algo;
247247
using cl::sycl::detail::enable_if_t;
248248
using cl::sycl::detail::queue_impl;
249249

250-
template <typename KernelName, typename KernelType, int Dims, class Reduction>
251-
void reduCGFunc(handler &CGH, KernelType KernelFunc, const range<Dims> &Range,
252-
size_t MaxWGSize, uint32_t NumConcurrentWorkGroups,
253-
Reduction &Redu);
250+
// Kernels with single reduction
254251

252+
/// If we are given sycl::range and not sycl::nd_range we have more freedom in
253+
/// how to split the iteration space.
255254
template <typename KernelName, typename KernelType, int Dims, class Reduction>
256-
enable_if_t<Reduction::has_atomic_add_float64>
257-
reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
258-
const nd_range<Dims> &Range, Reduction &Redu);
255+
void reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
256+
const range<Dims> &Range, size_t MaxWGSize,
257+
uint32_t NumConcurrentWorkGroups, Reduction &Redu);
259258

260259
template <typename KernelName, typename KernelType, int Dims, class Reduction>
261-
enable_if_t<Reduction::has_fast_atomics>
262-
reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
263-
Reduction &Redu);
260+
void reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
261+
const nd_range<Dims> &Range, Reduction &Redu);
264262

265263
template <typename KernelName, typename KernelType, int Dims, class Reduction>
266-
enable_if_t<!Reduction::has_fast_atomics>
267-
reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
268-
Reduction &Redu);
264+
void reduCGFunc(handler &CGH, KernelType KernelFunc,
265+
const nd_range<Dims> &Range, Reduction &Redu);
269266

270-
template <typename KernelName, typename KernelType, class Reduction>
271-
enable_if_t<!Reduction::has_fast_atomics, size_t>
272-
reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
273-
Reduction &Redu);
267+
// Kernels with multiple reductions
274268

269+
// sycl::nd_range version
275270
template <typename KernelName, typename KernelType, int Dims,
276271
typename... Reductions, size_t... Is>
277-
void reduCGFunc(handler &CGH, KernelType KernelFunc,
278-
const nd_range<Dims> &Range,
279-
std::tuple<Reductions...> &ReduTuple,
280-
std::index_sequence<Is...>);
272+
void reduCGFuncMulti(handler &CGH, KernelType KernelFunc,
273+
const nd_range<Dims> &Range,
274+
std::tuple<Reductions...> &ReduTuple,
275+
std::index_sequence<Is...>);
276+
277+
template <typename KernelName, typename KernelType, class Reduction>
278+
size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
279+
Reduction &Redu);
281280

282281
template <typename KernelName, typename KernelType, typename... Reductions,
283282
size_t... Is>
@@ -299,12 +298,6 @@ reduSaveFinalResultToUserMem(std::shared_ptr<detail::queue_impl> Queue,
299298
bool IsHost, std::tuple<Reduction...> &ReduTuple,
300299
std::index_sequence<Is...>);
301300

302-
template <typename Reduction, typename... RestT>
303-
std::enable_if_t<!Reduction::is_usm>
304-
reduSaveFinalResultToUserMemHelper(std::vector<event> &Events,
305-
std::shared_ptr<detail::queue_impl> Queue,
306-
bool IsHost, Reduction &Redu, RestT... Rest);
307-
308301
__SYCL_EXPORT uint32_t
309302
reduGetMaxNumConcurrentWorkGroups(std::shared_ptr<queue_impl> Queue);
310303

@@ -469,6 +462,27 @@ class __SYCL_EXPORT handler {
469462
MStreamStorage.push_back(Stream);
470463
}
471464

465+
/// Helper utility for an operation widely used throughout different reduction
466+
/// implementations.
467+
/// @{
468+
template <class FunctorTy>
469+
event withAuxHandler(std::shared_ptr<detail::queue_impl> Queue,
470+
FunctorTy Func) {
471+
handler AuxHandler(Queue, MIsHost);
472+
AuxHandler.saveCodeLoc(MCodeLoc);
473+
Func(AuxHandler);
474+
return AuxHandler.finalize();
475+
}
476+
477+
template <class FunctorTy>
478+
static event withAuxHandler(std::shared_ptr<detail::queue_impl> Queue,
479+
bool IsHost, FunctorTy Func) {
480+
handler AuxHandler(Queue, IsHost);
481+
Func(AuxHandler);
482+
return AuxHandler.finalize();
483+
}
484+
/// @}
485+
472486
/// Saves buffers created by handling reduction feature in handler.
473487
/// They are then forwarded to command group and destroyed only after
474488
/// the command group finishes the work on device/host.
@@ -1586,6 +1600,9 @@ class __SYCL_EXPORT handler {
15861600
#endif
15871601
}
15881602

1603+
// "if constexpr" simplifies implementation/increases readability in comparison
1604+
// with SFINAE-based approach.
1605+
#if __cplusplus >= 201703L
15891606
/// Defines and invokes a SYCL kernel function for the specified nd_range.
15901607
///
15911608
/// The SYCL kernel function is defined as a lambda function or a named
@@ -1617,123 +1634,76 @@ class __SYCL_EXPORT handler {
16171634
// for the device.
16181635
size_t MaxWGSize =
16191636
ext::oneapi::detail::reduGetMaxWGSize(MQueue, OneElemSize);
1620-
ext::oneapi::detail::reduCGFunc<KernelName>(
1637+
ext::oneapi::detail::reduCGFuncForRange<KernelName>(
16211638
*this, KernelFunc, Range, MaxWGSize, NumConcurrentWorkGroups, Redu);
16221639
if (Reduction::is_usm ||
16231640
(Reduction::has_fast_atomics && Redu.initializeToIdentity()) ||
16241641
(!Reduction::has_fast_atomics && Redu.hasUserDiscardWriteAccessor())) {
16251642
this->finalize();
1626-
handler CopyHandler(QueueCopy, MIsHost);
1627-
CopyHandler.saveCodeLoc(MCodeLoc);
1628-
ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
1629-
Redu);
1630-
MLastEvent = CopyHandler.finalize();
1631-
}
1632-
}
1633-
1634-
/// Implements parallel_for() accepting nd_range \p Range and one reduction
1635-
/// object. This version uses fast sycl::atomic operations to update reduction
1636-
/// variable at the end of each work-group work.
1637-
//
1638-
// If the reduction variable must be initialized with the identity value
1639-
// before the kernel run, then an additional working accessor is created,
1640-
// initialized with the identity value and used in the kernel. That working
1641-
// accessor is then copied to user's accessor or USM pointer after
1642-
// the kernel run.
1643-
// For USM pointers without initialize_to_identity properties the same scheme
1644-
// with working accessor is used as re-using user's USM pointer in the kernel
1645-
// would require creation of another variant of user's kernel, which does not
1646-
// seem efficient.
1647-
template <typename KernelName = detail::auto_name, typename KernelType,
1648-
int Dims, typename Reduction>
1649-
detail::enable_if_t<Reduction::has_fast_atomics>
1650-
parallel_for(nd_range<Dims> Range, Reduction Redu,
1651-
_KERNELFUNCPARAM(KernelFunc)) {
1652-
std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
1653-
ext::oneapi::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
1654-
1655-
if (Reduction::is_usm || Redu.initializeToIdentity()) {
1656-
this->finalize();
1657-
handler CopyHandler(QueueCopy, MIsHost);
1658-
CopyHandler.saveCodeLoc(MCodeLoc);
1659-
ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
1660-
Redu);
1661-
MLastEvent = CopyHandler.finalize();
1643+
MLastEvent = withAuxHandler(QueueCopy, [&](handler &CopyHandler) {
1644+
ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
1645+
CopyHandler, Redu);
1646+
});
16621647
}
16631648
}
16641649

1665-
/// Implements parallel_for() accepting nd_range \p Range and one reduction
1666-
/// object. This version is a specialization for the add operator.
1667-
/// It performs runtime checks for device aspect "atomic64"; if found, fast
1668-
/// sycl::atomic_ref operations are used to update the reduction at the
1669-
/// end of each work-group work. Otherwise the default implementation is
1670-
/// used.
1671-
//
1672-
// If the reduction variable must be initialized with the identity value
1673-
// before the kernel run, then an additional working accessor is created,
1674-
// initialized with the identity value and used in the kernel. That working
1675-
// accessor is then copied to user's accessor or USM pointer after
1676-
// the kernel run.
1677-
// For USM pointers without initialize_to_identity properties the same scheme
1678-
// with working accessor is used as re-using user's USM pointer in the kernel
1679-
// would require creation of another variant of user's kernel, which does not
1680-
// seem efficient.
16811650
template <typename KernelName = detail::auto_name, typename KernelType,
16821651
int Dims, typename Reduction>
1683-
detail::enable_if_t<Reduction::has_atomic_add_float64>
1684-
parallel_for(nd_range<Dims> Range, Reduction Redu,
1685-
_KERNELFUNCPARAM(KernelFunc)) {
1686-
1687-
std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
1688-
device D = detail::getDeviceFromHandler(*this);
1689-
1690-
if (D.has(aspect::atomic64)) {
1691-
1692-
ext::oneapi::detail::reduCGFuncAtomic64<KernelName>(*this, KernelFunc,
1693-
Range, Redu);
1694-
1652+
void parallel_for(nd_range<Dims> Range, Reduction Redu,
1653+
_KERNELFUNCPARAM(KernelFunc)) {
1654+
if constexpr (!Reduction::has_fast_atomics &&
1655+
!Reduction::has_atomic_add_float64) {
1656+
// The most basic implementation.
1657+
parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
1658+
return;
1659+
} else { // Can't "early" return for "if constexpr".
1660+
std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
1661+
if constexpr (Reduction::has_atomic_add_float64) {
1662+
/// This version is a specialization for the add
1663+
/// operator. It performs runtime checks for device aspect "atomic64";
1664+
/// if found, fast sycl::atomic_ref operations are used to update the
1665+
/// reduction at the end of each work-group work. Otherwise the
1666+
/// default implementation is used.
1667+
device D = detail::getDeviceFromHandler(*this);
1668+
1669+
if (D.has(aspect::atomic64)) {
1670+
1671+
ext::oneapi::detail::reduCGFuncAtomic64<KernelName>(*this, KernelFunc,
1672+
Range, Redu);
1673+
} else {
1674+
// Resort to basic implementation as well.
1675+
parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
1676+
return;
1677+
}
1678+
} else {
1679+
// Use fast sycl::atomic operations to update reduction variable at the
1680+
// end of each work-group work.
1681+
ext::oneapi::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range,
1682+
Redu);
1683+
}
1684+
// If the reduction variable must be initialized with the identity value
1685+
// before the kernel run, then an additional working accessor is created,
1686+
// initialized with the identity value and used in the kernel. That
1687+
// working accessor is then copied to user's accessor or USM pointer after
1688+
// the kernel run.
1689+
// For USM pointers without initialize_to_identity properties the same
1690+
// scheme with working accessor is used as re-using user's USM pointer in
1691+
// the kernel would require creation of another variant of user's kernel,
1692+
// which does not seem efficient.
16951693
if (Reduction::is_usm || Redu.initializeToIdentity()) {
16961694
this->finalize();
1697-
handler CopyHandler(QueueCopy, MIsHost);
1698-
CopyHandler.saveCodeLoc(MCodeLoc);
1699-
ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
1700-
CopyHandler, Redu);
1701-
MLastEvent = CopyHandler.finalize();
1695+
MLastEvent = withAuxHandler(QueueCopy, [&](handler &CopyHandler) {
1696+
ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
1697+
CopyHandler, Redu);
1698+
});
17021699
}
1703-
} else {
1704-
parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
17051700
}
17061701
}
17071702

1708-
/// Defines and invokes a SYCL kernel function for the specified nd_range.
1709-
/// Performs reduction operation specified in \p Redu.
1710-
///
1711-
/// The SYCL kernel function is defined as a lambda function or a named
1712-
/// function object type and given an id or item for indexing in the indexing
1713-
/// space defined by \p Range.
1714-
/// If it is a named function object and the function object type is
1715-
/// globally visible, there is no need for the developer to provide
1716-
/// a kernel name for it.
1717-
///
1718-
/// TODO: Support HOST. The kernels called by this parallel_for() may use
1719-
/// some functionality that is not yet supported on HOST such as:
1720-
/// barrier(), and ext::oneapi::reduce() that also may be used in more
1721-
/// optimized implementations waiting for their turn of code-review.
1722-
template <typename KernelName = detail::auto_name, typename KernelType,
1723-
int Dims, typename Reduction>
1724-
detail::enable_if_t<!Reduction::has_fast_atomics &&
1725-
!Reduction::has_atomic_add_float64>
1726-
parallel_for(nd_range<Dims> Range, Reduction Redu,
1727-
_KERNELFUNCPARAM(KernelFunc)) {
1728-
1729-
parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
1730-
}
1731-
17321703
template <typename KernelName, typename KernelType, int Dims,
17331704
typename Reduction>
1734-
detail::enable_if_t<!Reduction::has_fast_atomics>
1735-
parallel_for_Impl(nd_range<Dims> Range, Reduction Redu,
1736-
KernelType KernelFunc) {
1705+
void parallel_for_Impl(nd_range<Dims> Range, Reduction Redu,
1706+
KernelType KernelFunc) {
17371707
// This parallel_for() is lowered to the following sequence:
17381708
// 1) Call a kernel that a) call user's lambda function and b) performs
17391709
// one iteration of reduction, storing the partial reductions/sums
@@ -1789,20 +1759,17 @@ class __SYCL_EXPORT handler {
17891759
PI_ERROR_INVALID_WORK_GROUP_SIZE);
17901760
size_t NWorkItems = Range.get_group_range().size();
17911761
while (NWorkItems > 1) {
1792-
handler AuxHandler(QueueCopy, MIsHost);
1793-
AuxHandler.saveCodeLoc(MCodeLoc);
1794-
1795-
NWorkItems = ext::oneapi::detail::reduAuxCGFunc<KernelName, KernelType>(
1796-
AuxHandler, NWorkItems, MaxWGSize, Redu);
1797-
MLastEvent = AuxHandler.finalize();
1762+
MLastEvent = withAuxHandler(QueueCopy, [&](handler &AuxHandler) {
1763+
NWorkItems = ext::oneapi::detail::reduAuxCGFunc<KernelName, KernelType>(
1764+
AuxHandler, NWorkItems, MaxWGSize, Redu);
1765+
});
17981766
} // end while (NWorkItems > 1)
17991767

18001768
if (Reduction::is_usm || Redu.hasUserDiscardWriteAccessor()) {
1801-
handler CopyHandler(QueueCopy, MIsHost);
1802-
CopyHandler.saveCodeLoc(MCodeLoc);
1803-
ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
1804-
Redu);
1805-
MLastEvent = CopyHandler.finalize();
1769+
MLastEvent = withAuxHandler(QueueCopy, [&](handler &CopyHandler) {
1770+
ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
1771+
CopyHandler, Redu);
1772+
});
18061773
}
18071774
}
18081775

@@ -1867,27 +1834,26 @@ class __SYCL_EXPORT handler {
18671834
std::to_string(MaxWGSize),
18681835
PI_ERROR_INVALID_WORK_GROUP_SIZE);
18691836

1870-
ext::oneapi::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range,
1871-
ReduTuple, ReduIndices);
1837+
ext::oneapi::detail::reduCGFuncMulti<KernelName>(*this, KernelFunc, Range,
1838+
ReduTuple, ReduIndices);
18721839
std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
18731840
this->finalize();
18741841

18751842
size_t NWorkItems = Range.get_group_range().size();
18761843
while (NWorkItems > 1) {
1877-
handler AuxHandler(QueueCopy, MIsHost);
1878-
AuxHandler.saveCodeLoc(MCodeLoc);
1879-
1880-
NWorkItems =
1881-
ext::oneapi::detail::reduAuxCGFunc<KernelName, decltype(KernelFunc)>(
1882-
AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices);
1883-
MLastEvent = AuxHandler.finalize();
1844+
MLastEvent = withAuxHandler(QueueCopy, [&](handler &AuxHandler) {
1845+
NWorkItems = ext::oneapi::detail::reduAuxCGFunc<KernelName,
1846+
decltype(KernelFunc)>(
1847+
AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices);
1848+
});
18841849
} // end while (NWorkItems > 1)
18851850

18861851
auto CopyEvent = ext::oneapi::detail::reduSaveFinalResultToUserMem(
18871852
QueueCopy, MIsHost, ReduTuple, ReduIndices);
18881853
if (CopyEvent)
18891854
MLastEvent = *CopyEvent;
18901855
}
1856+
#endif
18911857

18921858
/// Hierarchical kernel invocation method of a kernel defined as a lambda
18931859
/// encoding the body of each work-group to launch.
@@ -2688,14 +2654,6 @@ class __SYCL_EXPORT handler {
26882654
class Algorithm>
26892655
friend class ext::oneapi::detail::reduction_impl_algo;
26902656

2691-
// This method needs to call the method finalize() and also access to private
2692-
// ctor/dtor.
2693-
template <typename Reduction, typename... RestT>
2694-
std::enable_if_t<!Reduction::is_usm> friend ext::oneapi::detail::
2695-
reduSaveFinalResultToUserMemHelper(
2696-
std::vector<event> &Events, std::shared_ptr<detail::queue_impl> Queue,
2697-
bool IsHost, Reduction &, RestT...);
2698-
26992657
friend void detail::associateWithHandler(handler &,
27002658
detail::AccessorBaseHost *,
27012659
access::target);

0 commit comments

Comments
 (0)