Skip to content

Commit e6b6ae7

Browse files
authored
[SYCL] Fix endless-loop in reduction with 1 element local range (#2022)
For data types without fast atomics, the reduction implementation may need to run an additional kernel as many times as needed to converge all partial sums into a single scalar sum, which is possible only when the work-group size is greater than 1. Previously, that additional kernel reused the work-group size specified in the user's original kernel — which is unnecessary, and causes an endless loop when the local range has only 1 element. This patch checks the maximum available work-group size on the device, also checks the available local memory, and chooses the work-group size for the additional kernels accordingly; this eliminates the endless loop and makes the convergence process faster, since a bigger work-group size is chosen. Signed-off-by: Vyacheslav N Klochkov <[email protected]>
1 parent 84bf234 commit e6b6ae7

File tree

7 files changed

+159
-61
lines changed

7 files changed

+159
-61
lines changed

sycl/include/CL/sycl/handler.hpp

Lines changed: 41 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,7 @@ template <typename T, class BinaryOperation, int Dims, bool IsUSM,
179179
class reduction_impl;
180180

181181
using cl::sycl::detail::enable_if_t;
182+
using cl::sycl::detail::queue_impl;
182183

183184
template <typename KernelName, typename KernelType, int Dims, class Reduction,
184185
typename OutputT>
@@ -191,10 +192,14 @@ enable_if_t<!Reduction::has_fast_atomics>
191192
reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
192193
Reduction &Redu);
193194

194-
template <typename KernelName, typename KernelType, int Dims, class Reduction>
195-
enable_if_t<!Reduction::has_fast_atomics>
196-
reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
195+
template <typename KernelName, typename KernelType, class Reduction>
196+
enable_if_t<!Reduction::has_fast_atomics, size_t>
197+
reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
197198
Reduction &Redu);
199+
200+
__SYCL_EXPORT size_t reduGetMaxWGSize(shared_ptr_class<queue_impl> Queue,
201+
size_t LocalMemBytesPerWorkItem);
202+
198203
} // namespace detail
199204
} // namespace intel
200205

@@ -1036,8 +1041,6 @@ class __SYCL_EXPORT handler {
10361041
int Dims, typename Reduction>
10371042
detail::enable_if_t<!Reduction::has_fast_atomics>
10381043
parallel_for(nd_range<Dims> Range, Reduction Redu, KernelType KernelFunc) {
1039-
size_t NWorkGroups = Range.get_group_range().size();
1040-
10411044
// This parallel_for() is lowered to the following sequence:
10421045
// 1) Call a kernel that a) call user's lambda function and b) performs
10431046
// one iteration of reduction, storing the partial reductions/sums
@@ -1051,42 +1054,51 @@ class __SYCL_EXPORT handler {
10511054
// 2) Call an aux kernel (if necessary, i.e. if N2 > 1) as many times as
10521055
// necessary to reduce all partial sums into one final sum.
10531056

1057+
// Before running the kernels, check that device has enough local memory
1058+
// to hold local arrays that may be required for the reduction algorithm.
1059+
// TODO: If the work-group-size is limited by the local memory, then
1060+
// a special version of the main kernel may be created. The one that would
1061+
// not use local accessors, which means it would not do the reduction in
1062+
// the main kernel, but simply generate Range.get_global_range.size() number
1063+
// of partial sums, leaving the reduction work to the additional/aux
1064+
// kernels.
1065+
constexpr bool HFR = Reduction::has_fast_reduce;
1066+
size_t OneElemSize = HFR ? 0 : sizeof(typename Reduction::result_type);
1067+
// TODO: currently the maximal work group size is determined for the given
1068+
// queue/device, while it may be safer to use queries to the kernel compiled
1069+
// for the device.
1070+
size_t MaxWGSize = intel::detail::reduGetMaxWGSize(MQueue, OneElemSize);
1071+
if (Range.get_local_range().size() > MaxWGSize)
1072+
throw sycl::runtime_error("The implementation handling parallel_for with"
1073+
" reduction requires smaller work group size.",
1074+
PI_INVALID_WORK_GROUP_SIZE);
1075+
10541076
// 1. Call the kernel that includes user's lambda function.
10551077
intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
10561078
shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
10571079
this->finalize();
10581080

1059-
// 2. Run the additional aux kernel as many times as needed to reduce
1081+
// 2. Run the additional kernel as many times as needed to reduce
10601082
// all partial sums into one scalar.
10611083

1062-
// TODO: user's nd_range and the work-group size specified there must
1063-
// be honored only for the main kernel that calls user's lambda functions.
1064-
// There is no need in using the same work-group size in these additional
1065-
// kernels. Thus, the better strategy here is to make the work-group size
1066-
// as big as possible to converge/reduce the partial sums into the last
1067-
// sum faster.
1068-
size_t WGSize = Range.get_local_range().size();
1069-
size_t NWorkItems = NWorkGroups;
1084+
// TODO: Create a special slow/sequential version of the kernel that would
1085+
// handle the reduction instead of reporting an assert below.
1086+
if (MaxWGSize <= 1)
1087+
throw sycl::runtime_error("The implementation handling parallel_for with "
1088+
"reduction requires the maximal work group "
1089+
"size to be greater than 1 to converge. "
1090+
"The maximal work group size depends on the "
1091+
"device and the size of the objects passed to "
1092+
"the reduction.",
1093+
PI_INVALID_WORK_GROUP_SIZE);
1094+
size_t NWorkItems = Range.get_group_range().size();
10701095
while (NWorkItems > 1) {
1071-
WGSize = std::min(WGSize, NWorkItems);
1072-
NWorkGroups = NWorkItems / WGSize;
1073-
// The last group may be not fully loaded. Still register it as a group.
1074-
if ((NWorkItems % WGSize) != 0)
1075-
++NWorkGroups;
1076-
nd_range<1> Range(range<1>(WGSize * NWorkGroups), range<1>(WGSize));
1077-
10781096
handler AuxHandler(QueueCopy, MIsHost);
10791097
AuxHandler.saveCodeLoc(MCodeLoc);
10801098

1081-
// The last kernel DOES write to user's accessor passed to reduction.
1082-
// Associate it with handler manually.
1083-
if (NWorkGroups == 1 && !Reduction::is_usm)
1084-
Redu.associateWithHandler(AuxHandler);
1085-
intel::detail::reduAuxCGFunc<KernelName, KernelType>(AuxHandler, Range,
1086-
NWorkItems, Redu);
1099+
NWorkItems = intel::detail::reduAuxCGFunc<KernelName, KernelType>(
1100+
AuxHandler, NWorkItems, MaxWGSize, Redu);
10871101
MLastEvent = AuxHandler.finalize();
1088-
1089-
NWorkItems = NWorkGroups;
10901102
} // end while (NWorkItems > 1)
10911103
}
10921104

sycl/include/CL/sycl/intel/reduction.hpp

Lines changed: 46 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#pragma once
1010

1111
#include <CL/sycl/accessor.hpp>
12+
#include <CL/sycl/handler.hpp>
1213
#include <CL/sycl/intel/group_algorithm.hpp>
1314

1415
__SYCL_INLINE_NAMESPACE(cl) {
@@ -17,6 +18,11 @@ namespace intel {
1718

1819
namespace detail {
1920

21+
__SYCL_EXPORT size_t reduGetMaxWGSize(shared_ptr_class<queue_impl> Queue,
22+
size_t LocalMemBytesPerWorkItem);
23+
__SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
24+
size_t &NWorkGroups);
25+
2026
using cl::sycl::detail::bool_constant;
2127
using cl::sycl::detail::enable_if_t;
2228
using cl::sycl::detail::is_geninteger16bit;
@@ -867,19 +873,19 @@ reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
867873
/// of work-groups. At the end of each work-groups the partial sum is written
868874
/// to a global buffer.
869875
///
870-
/// Briefly: aux kernel, intel:reduce(), reproducible results,FP + ADD/MIN/MAX
871-
template <typename KernelName, typename KernelType, int Dims, class Reduction,
872-
bool UniformWG, typename InputT, typename OutputT>
876+
/// Briefly: aux kernel, intel:reduce(), reproducible results, FP + ADD/MIN/MAX
877+
template <typename KernelName, typename KernelType, bool UniformWG,
878+
class Reduction, typename InputT, typename OutputT>
873879
enable_if_t<Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
874-
reduAuxCGFuncImpl(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
875-
Reduction &, InputT In, OutputT Out) {
876-
size_t NWorkGroups = Range.get_group_range().size();
877-
bool IsUpdateOfUserVar =
878-
Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;
879-
880+
reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups,
881+
size_t WGSize, Reduction &, InputT In, OutputT Out) {
880882
using Name = typename get_reduction_aux_kernel_name_t<
881883
KernelName, KernelType, Reduction::is_usm, UniformWG, OutputT>::name;
882-
CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
884+
885+
bool IsUpdateOfUserVar =
886+
Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;
887+
nd_range<1> Range{range<1>(NWorkItems), range<1>(WGSize)};
888+
CGH.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
883889
typename Reduction::binary_operation BOp;
884890
size_t WGID = NDIt.get_group_linear_id();
885891
size_t GID = NDIt.get_global_linear_id();
@@ -903,14 +909,11 @@ reduAuxCGFuncImpl(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
903909
/// to a global buffer.
904910
///
905911
/// Briefly: aux kernel, tree-reduction, CUSTOM types/ops.
906-
template <typename KernelName, typename KernelType, int Dims, class Reduction,
907-
bool UniformPow2WG, typename InputT, typename OutputT>
912+
template <typename KernelName, typename KernelType, bool UniformPow2WG,
913+
class Reduction, typename InputT, typename OutputT>
908914
enable_if_t<!Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
909-
reduAuxCGFuncImpl(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
910-
Reduction &Redu, InputT In, OutputT Out) {
911-
size_t WGSize = Range.get_local_range().size();
912-
size_t NWorkGroups = Range.get_group_range().size();
913-
915+
reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups,
916+
size_t WGSize, Reduction &Redu, InputT In, OutputT Out) {
914917
bool IsUpdateOfUserVar =
915918
Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;
916919

@@ -924,7 +927,8 @@ reduAuxCGFuncImpl(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
924927
auto ReduIdentity = Redu.getIdentity();
925928
using Name = typename get_reduction_aux_kernel_name_t<
926929
KernelName, KernelType, Reduction::is_usm, UniformPow2WG, OutputT>::name;
927-
CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
930+
nd_range<1> Range{range<1>(NWorkItems), range<1>(WGSize)};
931+
CGH.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
928932
size_t WGSize = NDIt.get_local_range().size();
929933
size_t LID = NDIt.get_local_linear_id();
930934
size_t GID = NDIt.get_global_linear_id();
@@ -962,12 +966,22 @@ reduAuxCGFuncImpl(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
962966
});
963967
}
964968

965-
template <typename KernelName, typename KernelType, int Dims, class Reduction>
966-
enable_if_t<!Reduction::has_fast_atomics>
967-
reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
969+
/// Implements a command group function that enqueues a kernel that does one
970+
/// iteration of reduction of elements in each of work-groups.
971+
/// At the end of each work-group the partial sum is written to a global buffer.
972+
/// The function returns the number of the newly generated partial sums.
973+
template <typename KernelName, typename KernelType, class Reduction>
974+
enable_if_t<!Reduction::has_fast_atomics, size_t>
975+
reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
968976
Reduction &Redu) {
969-
size_t WGSize = Range.get_local_range().size();
970-
size_t NWorkGroups = Range.get_group_range().size();
977+
978+
size_t NWorkGroups;
979+
size_t WGSize = reduComputeWGSize(NWorkItems, MaxWGSize, NWorkGroups);
980+
981+
// The last kernel DOES write to user's accessor passed to reduction.
982+
// Associate it with handler manually.
983+
if (NWorkGroups == 1 && !Reduction::is_usm)
984+
Redu.associateWithHandler(CGH);
971985

972986
// The last work-group may be not fully loaded with work, or the work group
973987
// size may be not power of two. Those two cases considered inefficient
@@ -981,20 +995,21 @@ reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
981995
auto In = Redu.getReadAccToPreviousPartialReds(CGH);
982996
if (Reduction::is_usm && NWorkGroups == 1) {
983997
if (HasUniformWG)
984-
reduAuxCGFuncImpl<KernelName, KernelType, Dims, Reduction, true>(
985-
CGH, Range, NWorkItems, Redu, In, Redu.getUSMPointer());
998+
reduAuxCGFuncImpl<KernelName, KernelType, true>(
999+
CGH, NWorkItems, NWorkGroups, WGSize, Redu, In, Redu.getUSMPointer());
9861000
else
987-
reduAuxCGFuncImpl<KernelName, KernelType, Dims, Reduction, false>(
988-
CGH, Range, NWorkItems, Redu, In, Redu.getUSMPointer());
1001+
reduAuxCGFuncImpl<KernelName, KernelType, false>(
1002+
CGH, NWorkItems, NWorkGroups, WGSize, Redu, In, Redu.getUSMPointer());
9891003
} else {
9901004
auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, CGH);
9911005
if (HasUniformWG)
992-
reduAuxCGFuncImpl<KernelName, KernelType, Dims, Reduction, true>(
993-
CGH, Range, NWorkItems, Redu, In, Out);
1006+
reduAuxCGFuncImpl<KernelName, KernelType, true>(
1007+
CGH, NWorkItems, NWorkGroups, WGSize, Redu, In, Out);
9941008
else
995-
reduAuxCGFuncImpl<KernelName, KernelType, Dims, Reduction, false>(
996-
CGH, Range, NWorkItems, Redu, In, Out);
1009+
reduAuxCGFuncImpl<KernelName, KernelType, false>(
1010+
CGH, NWorkItems, NWorkGroups, WGSize, Redu, In, Out);
9971011
}
1012+
return NWorkGroups;
9981013
}
9991014

10001015
} // namespace detail

sycl/source/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,7 @@ set(SYCL_SOURCES
125125
"detail/queue_impl.cpp"
126126
"detail/os_util.cpp"
127127
"detail/platform_util.cpp"
128+
"detail/reduction.cpp"
128129
"detail/sampler_impl.cpp"
129130
"detail/stream_impl.cpp"
130131
"detail/scheduler/commands.cpp"

sycl/source/detail/reduction.cpp

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
//==---------------- reduction.cpp - SYCL reduction ------------*- C++ -*---==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include <CL/sycl/intel/reduction.hpp>
10+
#include <detail/queue_impl.hpp>
11+
12+
__SYCL_INLINE_NAMESPACE(cl) {
13+
namespace sycl {
14+
namespace intel {
15+
namespace detail {
16+
17+
// TODO: The algorithm of choosing the work-group size is definitely
// imperfect now and can be improved.

// Fallback so this translation unit also compiles standalone; it is a no-op
// in the normal build where the export header already defines the macro.
#ifndef __SYCL_EXPORT
#define __SYCL_EXPORT
#endif

/// Chooses the work-group size (returned) and the number of work-groups
/// (written to \p NWorkGroups) used to reduce \p NWorkItems partial sums,
/// honoring the maximal allowed work-group size \p MaxWGSize.
/// The chosen grid always covers all work-items, i.e. the returned WGSize
/// multiplied by NWorkGroups is greater than or equal to NWorkItems.
__SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
                                       size_t &NWorkGroups) {
  size_t WGSize = MaxWGSize;
  if (NWorkItems <= WGSize) {
    // All work-items fit into a single work-group.
    NWorkGroups = 1;
    WGSize = NWorkItems;
  } else {
    NWorkGroups = NWorkItems / WGSize;
    size_t Rem = NWorkItems % WGSize;
    if (Rem != 0) {
      // Let's say MaxWGSize = 128 and NWorkItems is (128+32).
      // It seems better to have 5 groups 32 work-items each than 2 groups with
      // 128 work-items in the 1st group and 32 work-items in the 2nd group.
      size_t NWorkGroupsAlt = NWorkItems / Rem;
      size_t RemAlt = NWorkItems % Rem;
      if (RemAlt == 0 && NWorkGroupsAlt <= MaxWGSize) {
        NWorkGroups = NWorkGroupsAlt;
        WGSize = Rem;
      } else {
        // The alternative uniform split does not apply; add one more
        // (partially loaded) work-group to cover the remaining Rem
        // work-items. Note: the increment belongs HERE, not on the even
        // division path — an even division needs no extra group, while an
        // uneven one without the alternative split must not leave the tail
        // work-items uncovered.
        NWorkGroups++;
      }
    }
  }
  return WGSize;
}
44+
45+
/// Returns the maximal possible work-group size for the device associated
/// with \p Queue, additionally restricted by the amount of local memory
/// available on the device when each work-item of the reduction requires
/// \p LocalMemBytesPerWorkItem bytes of local memory (0 means the reduction
/// kernel uses no local memory and only the device limit applies).
__SYCL_EXPORT size_t
reduGetMaxWGSize(shared_ptr_class<sycl::detail::queue_impl> Queue,
                 size_t LocalMemBytesPerWorkItem) {
  device Dev = Queue->get_device();
  size_t WGSize = Dev.get_info<info::device::max_work_group_size>();
  if (LocalMemBytesPerWorkItem != 0) {
    size_t MemSize = Dev.get_info<info::device::local_mem_size>();
    size_t WGSizePerMem = MemSize / LocalMemBytesPerWorkItem;

    // If the work group size is not pow of two, then an additional element
    // in local memory is needed for the reduction algorithm and thus the real
    // work-group size requirement per available memory is stricter.
    // NOTE(review): the previous condition decremented when the size WAS a
    // power of two (and tested the device limit instead of the memory-limited
    // candidate), contradicting the comment above; it could also underflow
    // when WGSizePerMem == 0. Fixed to match the stated intent.
    if ((WGSizePerMem & (WGSizePerMem - 1)) != 0)
      WGSizePerMem--;
    WGSize = (std::min)(WGSizePerMem, WGSize);
  }
  return WGSize;
}
63+
64+
} // namespace detail
65+
} // namespace intel
66+
} // namespace sycl
67+
} // __SYCL_INLINE_NAMESPACE(cl)

sycl/test/abi/sycl_symbols_linux.dump

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3621,6 +3621,8 @@ _ZN2cl4sycl5eventC1Ev
36213621
_ZN2cl4sycl5eventC2EP9_cl_eventRKNS0_7contextE
36223622
_ZN2cl4sycl5eventC2ESt10shared_ptrINS0_6detail10event_implEE
36233623
_ZN2cl4sycl5eventC2Ev
3624+
_ZN2cl4sycl5intel6detail17reduComputeWGSizeEmmRm
3625+
_ZN2cl4sycl5intel6detail16reduGetMaxWGSizeESt10shared_ptrINS0_6detail10queue_implEEm
36243626
_ZN2cl4sycl5queue10mem_adviseEPKvm14_pi_mem_advice
36253627
_ZN2cl4sycl5queue10wait_proxyERKNS0_6detail13code_locationE
36263628
_ZN2cl4sycl5queue11submit_implESt8functionIFvRNS0_7handlerEEERKNS0_6detail13code_locationE

sycl/test/reduction/reduction_nd_s1_rw.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ int main() {
8787

8888
// Check with various types.
8989
test<float, 1, std::multiplies<float>>(1, 8, 256);
90-
test<float, 1, intel::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
90+
test<float, 1, intel::minimum<float>>(getMaximumFPValue<float>(), 1, 16);
9191
test<float, 1, intel::maximum<float>>(getMinimumFPValue<float>(), 8, 256);
9292

9393
// Check with CUSTOM type.

sycl/test/reduction/reduction_transparent.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ int main() {
113113
test<float, 0, intel::maximum<>>(getMinimumFPValue<float>(), 7, 7 * 5);
114114
test<signed char, 0, intel::plus<>>(0, 7, 49);
115115
test<unsigned char, 1, std::multiplies<>>(1, 4, 16);
116+
test<unsigned short, 0, intel::plus<>>(0, 1, 512 + 32);
116117
#endif // __cplusplus >= 201402L
117118

118119
std::cout << "Test passed\n";

0 commit comments

Comments
 (0)