Skip to content

[SYCL] Fix endless-loop in reduction with nd_range having 1 element l… #2022

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jul 21, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 41 additions & 29 deletions sycl/include/CL/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ template <typename T, class BinaryOperation, int Dims, bool IsUSM,
class reduction_impl;

using cl::sycl::detail::enable_if_t;
using cl::sycl::detail::queue_impl;

template <typename KernelName, typename KernelType, int Dims, class Reduction,
typename OutputT>
Expand All @@ -191,10 +192,14 @@ enable_if_t<!Reduction::has_fast_atomics>
reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
Reduction &Redu);

template <typename KernelName, typename KernelType, int Dims, class Reduction>
enable_if_t<!Reduction::has_fast_atomics>
reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
template <typename KernelName, typename KernelType, class Reduction>
enable_if_t<!Reduction::has_fast_atomics, size_t>
reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
Reduction &Redu);

__SYCL_EXPORT size_t reduGetMaxWGSize(shared_ptr_class<queue_impl> Queue,
size_t LocalMemBytesPerWorkItem);

} // namespace detail
} // namespace intel

Expand Down Expand Up @@ -1028,8 +1033,6 @@ class __SYCL_EXPORT handler {
int Dims, typename Reduction>
detail::enable_if_t<!Reduction::has_fast_atomics>
parallel_for(nd_range<Dims> Range, Reduction Redu, KernelType KernelFunc) {
size_t NWorkGroups = Range.get_group_range().size();

// This parallel_for() is lowered to the following sequence:
// 1) Call a kernel that a) call user's lambda function and b) performs
// one iteration of reduction, storing the partial reductions/sums
Expand All @@ -1043,42 +1046,51 @@ class __SYCL_EXPORT handler {
// 2) Call an aux kernel (if necessary, i.e. if N2 > 1) as many times as
// necessary to reduce all partial sums into one final sum.

// Before running the kernels, check that device has enough local memory
// to hold local arrays that may be required for the reduction algorithm.
// TODO: If the work-group-size is limited by the local memory, then
// a special version of the main kernel may be created. The one that would
// not use local accessors, which means it would not do the reduction in
// the main kernel, but simply generate Range.get_global_range.size() number
// of partial sums, leaving the reduction work to the additional/aux
// kernels.
constexpr bool HFR = Reduction::has_fast_reduce;
size_t OneElemSize = HFR ? 0 : sizeof(typename Reduction::result_type);
// TODO: currently the maximal work group size is determined for the given
// queue/device, while it may be safer to use queries to the kernel compiled
// for the device.
size_t MaxWGSize = intel::detail::reduGetMaxWGSize(MQueue, OneElemSize);
if (Range.get_local_range().size() > MaxWGSize)
throw sycl::runtime_error("The implementation handling parallel_for with"
" reduction requires smaller work group size.",
PI_INVALID_WORK_GROUP_SIZE);

// 1. Call the kernel that includes user's lambda function.
intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
this->finalize();

// 2. Run the additional aux kernel as many times as needed to reduce
// 2. Run the additional kernel as many times as needed to reduce
// all partial sums into one scalar.

// TODO: user's nd_range and the work-group size specified there must
// be honored only for the main kernel that calls user's lambda functions.
// There is no need in using the same work-group size in these additional
// kernels. Thus, the better strategy here is to make the work-group size
// as big as possible to converge/reduce the partial sums into the last
// sum faster.
size_t WGSize = Range.get_local_range().size();
size_t NWorkItems = NWorkGroups;
// TODO: Create a special slow/sequential version of the kernel that would
// handle the reduction instead of reporting an assert below.
if (MaxWGSize <= 1)
throw sycl::runtime_error("The implementation handling parallel_for with "
"reduction requires the maximal work group "
"size to be greater than 1 to converge. "
"The maximal work group size depends on the "
"device and the size of the objects passed to "
"the reduction.",
PI_INVALID_WORK_GROUP_SIZE);
size_t NWorkItems = Range.get_group_range().size();
while (NWorkItems > 1) {
WGSize = std::min(WGSize, NWorkItems);
NWorkGroups = NWorkItems / WGSize;
// The last group may be not fully loaded. Still register it as a group.
if ((NWorkItems % WGSize) != 0)
++NWorkGroups;
nd_range<1> Range(range<1>(WGSize * NWorkGroups), range<1>(WGSize));

handler AuxHandler(QueueCopy, MIsHost);
AuxHandler.saveCodeLoc(MCodeLoc);

// The last kernel DOES write to user's accessor passed to reduction.
// Associate it with handler manually.
if (NWorkGroups == 1 && !Reduction::is_usm)
Redu.associateWithHandler(AuxHandler);
intel::detail::reduAuxCGFunc<KernelName, KernelType>(AuxHandler, Range,
NWorkItems, Redu);
NWorkItems = intel::detail::reduAuxCGFunc<KernelName, KernelType>(
AuxHandler, NWorkItems, MaxWGSize, Redu);
MLastEvent = AuxHandler.finalize();

NWorkItems = NWorkGroups;
} // end while (NWorkItems > 1)
}

Expand Down
77 changes: 46 additions & 31 deletions sycl/include/CL/sycl/intel/reduction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#pragma once

#include <CL/sycl/accessor.hpp>
#include <CL/sycl/handler.hpp>
#include <CL/sycl/intel/group_algorithm.hpp>

__SYCL_INLINE_NAMESPACE(cl) {
Expand All @@ -17,6 +18,11 @@ namespace intel {

namespace detail {

__SYCL_EXPORT size_t reduGetMaxWGSize(shared_ptr_class<queue_impl> Queue,
size_t LocalMemBytesPerWorkItem);
__SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
size_t &NWorkGroups);

using cl::sycl::detail::bool_constant;
using cl::sycl::detail::enable_if_t;
using cl::sycl::detail::is_geninteger16bit;
Expand Down Expand Up @@ -867,19 +873,19 @@ reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
/// of work-groups. At the end of each work-groups the partial sum is written
/// to a global buffer.
///
/// Briefly: aux kernel, intel:reduce(), reproducible results,FP + ADD/MIN/MAX
template <typename KernelName, typename KernelType, int Dims, class Reduction,
bool UniformWG, typename InputT, typename OutputT>
/// Briefly: aux kernel, intel::reduce(), reproducible results, FP + ADD/MIN/MAX
template <typename KernelName, typename KernelType, bool UniformWG,
class Reduction, typename InputT, typename OutputT>
enable_if_t<Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
reduAuxCGFuncImpl(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
Reduction &, InputT In, OutputT Out) {
size_t NWorkGroups = Range.get_group_range().size();
bool IsUpdateOfUserVar =
Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;

reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups,
size_t WGSize, Reduction &, InputT In, OutputT Out) {
using Name = typename get_reduction_aux_kernel_name_t<
KernelName, KernelType, Reduction::is_usm, UniformWG, OutputT>::name;
CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {

bool IsUpdateOfUserVar =
Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;
nd_range<1> Range{range<1>(NWorkItems), range<1>(WGSize)};
CGH.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
typename Reduction::binary_operation BOp;
size_t WGID = NDIt.get_group_linear_id();
size_t GID = NDIt.get_global_linear_id();
Expand All @@ -903,14 +909,11 @@ reduAuxCGFuncImpl(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
/// to a global buffer.
///
/// Briefly: aux kernel, tree-reduction, CUSTOM types/ops.
template <typename KernelName, typename KernelType, int Dims, class Reduction,
bool UniformPow2WG, typename InputT, typename OutputT>
template <typename KernelName, typename KernelType, bool UniformPow2WG,
class Reduction, typename InputT, typename OutputT>
enable_if_t<!Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
reduAuxCGFuncImpl(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
Reduction &Redu, InputT In, OutputT Out) {
size_t WGSize = Range.get_local_range().size();
size_t NWorkGroups = Range.get_group_range().size();

reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups,
size_t WGSize, Reduction &Redu, InputT In, OutputT Out) {
bool IsUpdateOfUserVar =
Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;

Expand All @@ -924,7 +927,8 @@ reduAuxCGFuncImpl(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
auto ReduIdentity = Redu.getIdentity();
using Name = typename get_reduction_aux_kernel_name_t<
KernelName, KernelType, Reduction::is_usm, UniformPow2WG, OutputT>::name;
CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
nd_range<1> Range{range<1>(NWorkItems), range<1>(WGSize)};
CGH.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
size_t WGSize = NDIt.get_local_range().size();
size_t LID = NDIt.get_local_linear_id();
size_t GID = NDIt.get_global_linear_id();
Expand Down Expand Up @@ -962,12 +966,22 @@ reduAuxCGFuncImpl(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
});
}

template <typename KernelName, typename KernelType, int Dims, class Reduction>
enable_if_t<!Reduction::has_fast_atomics>
reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
/// Implements a command group function that enqueues a kernel that does one
/// iteration of reduction of elements in each of work-groups.
/// At the end of each work-group the partial sum is written to a global buffer.
/// The function returns the number of the newly generated partial sums.
template <typename KernelName, typename KernelType, class Reduction>
enable_if_t<!Reduction::has_fast_atomics, size_t>
reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
Reduction &Redu) {
size_t WGSize = Range.get_local_range().size();
size_t NWorkGroups = Range.get_group_range().size();

size_t NWorkGroups;
size_t WGSize = reduComputeWGSize(NWorkItems, MaxWGSize, NWorkGroups);

// The last kernel DOES write to user's accessor passed to reduction.
// Associate it with handler manually.
if (NWorkGroups == 1 && !Reduction::is_usm)
Redu.associateWithHandler(CGH);

// The last work-group may be not fully loaded with work, or the work group
// size may be not power of two. Those two cases considered inefficient
Expand All @@ -981,20 +995,21 @@ reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
auto In = Redu.getReadAccToPreviousPartialReds(CGH);
if (Reduction::is_usm && NWorkGroups == 1) {
if (HasUniformWG)
reduAuxCGFuncImpl<KernelName, KernelType, Dims, Reduction, true>(
CGH, Range, NWorkItems, Redu, In, Redu.getUSMPointer());
reduAuxCGFuncImpl<KernelName, KernelType, true>(
CGH, NWorkItems, NWorkGroups, WGSize, Redu, In, Redu.getUSMPointer());
else
reduAuxCGFuncImpl<KernelName, KernelType, Dims, Reduction, false>(
CGH, Range, NWorkItems, Redu, In, Redu.getUSMPointer());
reduAuxCGFuncImpl<KernelName, KernelType, false>(
CGH, NWorkItems, NWorkGroups, WGSize, Redu, In, Redu.getUSMPointer());
} else {
auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, CGH);
if (HasUniformWG)
reduAuxCGFuncImpl<KernelName, KernelType, Dims, Reduction, true>(
CGH, Range, NWorkItems, Redu, In, Out);
reduAuxCGFuncImpl<KernelName, KernelType, true>(
CGH, NWorkItems, NWorkGroups, WGSize, Redu, In, Out);
else
reduAuxCGFuncImpl<KernelName, KernelType, Dims, Reduction, false>(
CGH, Range, NWorkItems, Redu, In, Out);
reduAuxCGFuncImpl<KernelName, KernelType, false>(
CGH, NWorkItems, NWorkGroups, WGSize, Redu, In, Out);
}
return NWorkGroups;
}

} // namespace detail
Expand Down
1 change: 1 addition & 0 deletions sycl/source/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ set(SYCL_SOURCES
"detail/queue_impl.cpp"
"detail/os_util.cpp"
"detail/platform_util.cpp"
"detail/reduction.cpp"
"detail/sampler_impl.cpp"
"detail/stream_impl.cpp"
"detail/scheduler/commands.cpp"
Expand Down
67 changes: 67 additions & 0 deletions sycl/source/detail/reduction.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
//==---------------- reduction.cpp - SYCL reduction ------------*- C++ -*---==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <CL/sycl/intel/reduction.hpp>
#include <detail/queue_impl.hpp>

__SYCL_INLINE_NAMESPACE(cl) {
namespace sycl {
namespace intel {
namespace detail {

/// Computes the work-group size to be used for reducing \p NWorkItems partial
/// sums when at most \p MaxWGSize work-items fit into one work-group.
/// Returns the chosen work-group size and writes the number of work-groups
/// needed to cover all \p NWorkItems work-items to \p NWorkGroups.
// TODO: The algorithm of choosing the work-group size is definitely
// imperfect now and can be improved.
__SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
                                       size_t &NWorkGroups) {
  size_t WGSize = MaxWGSize;
  if (NWorkItems <= WGSize) {
    // All work-items fit into a single work-group.
    NWorkGroups = 1;
    WGSize = NWorkItems;
  } else {
    NWorkGroups = NWorkItems / WGSize;
    size_t Rem = NWorkItems % WGSize;
    if (Rem != 0) {
      // The work-items do not spread evenly across NWorkGroups groups.
      // Let's say MaxWGSize = 128 and NWorkItems is (128+32).
      // It seems better to have 5 groups 32 work-items each than 2 groups with
      // 128 work-items in the 1st group and 32 work-items in the 2nd group.
      size_t NWorkGroupsAlt = NWorkItems / Rem;
      size_t RemAlt = NWorkItems % Rem;
      if (RemAlt == 0 && NWorkGroupsAlt <= MaxWGSize) {
        // Using Rem as the work-group size spreads the work perfectly evenly.
        NWorkGroups = NWorkGroupsAlt;
        WGSize = Rem;
      } else {
        // Keep MaxWGSize as the work-group size and register one extra,
        // partially loaded work-group for the remaining Rem work-items.
        // NOTE: this increment must live on the Rem != 0 path; attaching it
        // to the Rem == 0 case would over-count groups for evenly divisible
        // NWorkItems and silently drop the partial group here.
        ++NWorkGroups;
      }
    }
  }
  return WGSize;
}

/// Returns the maximal work-group size usable by the reduction kernels on the
/// device behind \p Queue, accounting for the fact that each work-item may
/// require \p LocalMemBytesPerWorkItem bytes of local memory.
// TODO: the maximal work-group size is currently determined for the
// queue/device only; it may be safer to query the kernel compiled for the
// device, whose kernel-specific maximum may be smaller.
__SYCL_EXPORT size_t
reduGetMaxWGSize(shared_ptr_class<sycl::detail::queue_impl> Queue,
                 size_t LocalMemBytesPerWorkItem) {
  device Dev = Queue->get_device();
  size_t WGSize = Dev.get_info<info::device::max_work_group_size>();
  if (LocalMemBytesPerWorkItem != 0) {
    size_t MemSize = Dev.get_info<info::device::local_mem_size>();
    // Upper bound on the work-group size imposed by available local memory.
    size_t WGSizePerMem = MemSize / LocalMemBytesPerWorkItem;

    // If the work-group size is not a power of two, then an additional element
    // in local memory is needed for the reduction algorithm and thus the real
    // work-group size requirement per available memory is stricter.
    // NOTE(review): the previous condition ((WGSize & (WGSize - 1)) == 0)
    // decremented the budget exactly when the size WAS a power of two —
    // the opposite of the intent stated above — tested the device maximum
    // instead of the memory-limited size, and underflowed when WGSizePerMem
    // was 0. The check below matches the comment and is safe at 0.
    if ((WGSizePerMem & (WGSizePerMem - 1)) != 0)
      --WGSizePerMem;

    // (std::min) is parenthesized to stay immune to a min() function macro.
    WGSize = (std::min)(WGSizePerMem, WGSize);
  }
  return WGSize;
}

} // namespace detail
} // namespace intel
} // namespace sycl
} // __SYCL_INLINE_NAMESPACE(cl)
2 changes: 2 additions & 0 deletions sycl/test/abi/sycl_symbols_linux.dump
Original file line number Diff line number Diff line change
Expand Up @@ -3616,6 +3616,8 @@ _ZN2cl4sycl5eventC1Ev
_ZN2cl4sycl5eventC2EP9_cl_eventRKNS0_7contextE
_ZN2cl4sycl5eventC2ESt10shared_ptrINS0_6detail10event_implEE
_ZN2cl4sycl5eventC2Ev
_ZN2cl4sycl5intel6detail17reduComputeWGSizeEmmRm
_ZN2cl4sycl5intel6detail16reduGetMaxWGSizeESt10shared_ptrINS0_6detail10queue_implEEm
_ZN2cl4sycl5queue10mem_adviseEPKvm14_pi_mem_advice
_ZN2cl4sycl5queue10wait_proxyERKNS0_6detail13code_locationE
_ZN2cl4sycl5queue11submit_implESt8functionIFvRNS0_7handlerEEERKNS0_6detail13code_locationE
Expand Down
2 changes: 1 addition & 1 deletion sycl/test/reduction/reduction_nd_s1_rw.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ int main() {

// Check with various types.
test<float, 1, std::multiplies<float>>(1, 8, 256);
test<float, 1, intel::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
test<float, 1, intel::minimum<float>>(getMaximumFPValue<float>(), 1, 16);
test<float, 1, intel::maximum<float>>(getMinimumFPValue<float>(), 8, 256);

// Check with CUSTOM type.
Expand Down
1 change: 1 addition & 0 deletions sycl/test/reduction/reduction_transparent.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ int main() {
test<float, 0, intel::maximum<>>(getMinimumFPValue<float>(), 7, 7 * 5);
test<signed char, 0, intel::plus<>>(0, 7, 49);
test<unsigned char, 1, std::multiplies<>>(1, 4, 16);
test<unsigned short, 0, intel::plus<>>(0, 1, 512 + 32);
#endif // __cplusplus >= 201402L

std::cout << "Test passed\n";
Expand Down