[SYCL] Fix endless-loop in reduction with nd_range having 1 element local range

v-klochkov · v-klochkov · commit d5620dd85639 · 2020-06-30T23:33:53.000-07:00
The reduction implementation for data types that do not have fast atomics
may require running an additional kernel as many times as needed to
converge all partial sums into a single scalar sum, which is possible
only when the work-group size is greater than 1.
The additional kernel used the work-group size specified in the original
user's kernel, which is not necessary and causes an endless loop when
the local range has only 1 element.

The patch queries the maximum work-group size available on the device
and the available local memory, and uses them to choose the work-group
size for the additional kernels. This eliminates the endless loop and
makes convergence faster because a bigger work-group size is chosen.

Signed-off-by: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
@@ -179,6 +179,7 @@ template <typename T, class BinaryOperation, int Dims, bool IsUSM,
 class reduction_impl;
 
 using cl::sycl::detail::enable_if_t;
+using cl::sycl::detail::queue_impl;
 
 template <typename KernelName, typename KernelType, int Dims, class Reduction,
           typename OutputT>
@@ -191,10 +192,14 @@ enable_if_t<!Reduction::has_fast_atomics>
 reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
            Reduction &Redu);
 
-template <typename KernelName, typename KernelType, int Dims, class Reduction>
-enable_if_t<!Reduction::has_fast_atomics>
-reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
+template <typename KernelName, typename KernelType, class Reduction>
+enable_if_t<!Reduction::has_fast_atomics, size_t>
+reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
               Reduction &Redu);
+
+__SYCL_EXPORT size_t reduGetMaxWGSize(shared_ptr_class<queue_impl> Queue,
+                                      size_t LocalMemBytesPerWorkItem);
+
 } // namespace detail
 } // namespace intel
 
@@ -1048,37 +1053,26 @@ class __SYCL_EXPORT handler {
     shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
     this->finalize();
 
-    // 2. Run the additional aux kernel as many times as needed to reduce
-    // all partial sums into one scalar.
+    // 2. Find the maximal work group size usable for the additional kernel(s).
+    size_t MaxWGSize;
+    if (NWorkGroups > 1) {
+      constexpr bool HFR = Reduction::has_fast_reduce;
+      size_t OneElemSize = HFR ? 0 : sizeof(typename Reduction::result_type);
+      MaxWGSize = intel::detail::reduGetMaxWGSize(QueueCopy, OneElemSize);
+      assert(MaxWGSize > 1 &&
+             "Work group size must be greater than 1 to avoid endless loop.");
+    }
 
-    // TODO: user's nd_range and the work-group size specified there must
-    // be honored only for the main kernel that calls user's lambda functions.
-    // There is no need in using the same work-group size in these additional
-    // kernels. Thus, the better strategy here is to make the work-group size
-    // as big as possible to converge/reduce the partial sums into the last
-    // sum faster.
-    size_t WGSize = Range.get_local_range().size();
+    // 3. Run the additional kernel as many times as needed to reduce
+    // all partial sums into one scalar.
     size_t NWorkItems = NWorkGroups;
     while (NWorkItems > 1) {
-      WGSize = std::min(WGSize, NWorkItems);
-      NWorkGroups = NWorkItems / WGSize;
-      // The last group may be not fully loaded. Still register it as a group.
-      if ((NWorkItems % WGSize) != 0)
-        ++NWorkGroups;
-      nd_range<1> Range(range<1>(WGSize * NWorkGroups), range<1>(WGSize));
-
       handler AuxHandler(QueueCopy, MIsHost);
       AuxHandler.saveCodeLoc(MCodeLoc);
 
-      // The last kernel DOES write to user's accessor passed to reduction.
-      // Associate it with handler manually.
-      if (NWorkGroups == 1 && !Reduction::is_usm)
-        Redu.associateWithHandler(AuxHandler);
-      intel::detail::reduAuxCGFunc<KernelName, KernelType>(AuxHandler, Range,
-                                                           NWorkItems, Redu);
+      NWorkItems = intel::detail::reduAuxCGFunc<KernelName, KernelType>(
+          AuxHandler, NWorkItems, MaxWGSize, Redu);
       MLastEvent = AuxHandler.finalize();
-
-      NWorkItems = NWorkGroups;
     } // end while (NWorkItems > 1)
   }
 
diff --git a/sycl/include/CL/sycl/intel/reduction.hpp b/sycl/include/CL/sycl/intel/reduction.hpp
@@ -9,6 +9,7 @@
 #pragma once
 
 #include <CL/sycl/accessor.hpp>
+#include <CL/sycl/handler.hpp>
 #include <CL/sycl/intel/group_algorithm.hpp>
 
 __SYCL_INLINE_NAMESPACE(cl) {
@@ -17,6 +18,11 @@ namespace intel {
 
 namespace detail {
 
+__SYCL_EXPORT size_t reduGetMaxWGSize(shared_ptr_class<queue_impl> Queue,
+                                      size_t LocalMemBytesPerWorkItem);
+__SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
+                                       size_t &NWorkGroups);
+
 using cl::sycl::detail::bool_constant;
 using cl::sycl::detail::enable_if_t;
 using cl::sycl::detail::is_geninteger16bit;
@@ -867,19 +873,19 @@ reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
 /// of work-groups. At the end of each work-groups the partial sum is written
 /// to a global buffer.
 ///
-/// Briefly: aux kernel, intel:reduce(), reproducible results,FP + ADD/MIN/MAX
-template <typename KernelName, typename KernelType, int Dims, class Reduction,
-          bool UniformWG, typename InputT, typename OutputT>
+/// Briefly: aux kernel, intel:reduce(), reproducible results, FP + ADD/MIN/MAX
+template <typename KernelName, typename KernelType, bool UniformWG,
+          class Reduction, typename InputT, typename OutputT>
 enable_if_t<Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
-reduAuxCGFuncImpl(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
-                  Reduction &, InputT In, OutputT Out) {
-  size_t NWorkGroups = Range.get_group_range().size();
-  bool IsUpdateOfUserVar =
-      Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;
-
+reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups,
+                  size_t WGSize, Reduction &, InputT In, OutputT Out) {
   using Name = typename get_reduction_aux_kernel_name_t<
       KernelName, KernelType, Reduction::is_usm, UniformWG, OutputT>::name;
-  CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
+
+  bool IsUpdateOfUserVar =
+      Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;
+  nd_range<1> Range{range<1>(NWorkItems), range<1>(WGSize)};
+  CGH.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
     typename Reduction::binary_operation BOp;
     size_t WGID = NDIt.get_group_linear_id();
     size_t GID = NDIt.get_global_linear_id();
@@ -903,14 +909,11 @@ reduAuxCGFuncImpl(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
 /// to a global buffer.
 ///
 /// Briefly: aux kernel, tree-reduction, CUSTOM types/ops.
-template <typename KernelName, typename KernelType, int Dims, class Reduction,
-          bool UniformPow2WG, typename InputT, typename OutputT>
+template <typename KernelName, typename KernelType, bool UniformPow2WG,
+          class Reduction, typename InputT, typename OutputT>
 enable_if_t<!Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
-reduAuxCGFuncImpl(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
-                  Reduction &Redu, InputT In, OutputT Out) {
-  size_t WGSize = Range.get_local_range().size();
-  size_t NWorkGroups = Range.get_group_range().size();
-
+reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups,
+                  size_t WGSize, Reduction &Redu, InputT In, OutputT Out) {
   bool IsUpdateOfUserVar =
       Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;
 
@@ -924,7 +927,8 @@ reduAuxCGFuncImpl(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
   auto ReduIdentity = Redu.getIdentity();
   using Name = typename get_reduction_aux_kernel_name_t<
       KernelName, KernelType, Reduction::is_usm, UniformPow2WG, OutputT>::name;
-  CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
+  nd_range<1> Range{range<1>(NWorkItems), range<1>(WGSize)};
+  CGH.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
     size_t WGSize = NDIt.get_local_range().size();
     size_t LID = NDIt.get_local_linear_id();
     size_t GID = NDIt.get_global_linear_id();
@@ -962,12 +966,22 @@ reduAuxCGFuncImpl(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
   });
 }
 
-template <typename KernelName, typename KernelType, int Dims, class Reduction>
-enable_if_t<!Reduction::has_fast_atomics>
-reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
+/// Implements a command group function that enqueues a kernel that does one
+/// iteration of reduction of elements in each of work-groups.
+/// At the end of each work-group the partial sum is written to a global buffer.
+/// The function returns the number of the newly generated partial sums.
+template <typename KernelName, typename KernelType, class Reduction>
+enable_if_t<!Reduction::has_fast_atomics, size_t>
+reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
               Reduction &Redu) {
-  size_t WGSize = Range.get_local_range().size();
-  size_t NWorkGroups = Range.get_group_range().size();
+
+  size_t NWorkGroups;
+  size_t WGSize = reduComputeWGSize(NWorkItems, MaxWGSize, NWorkGroups);
+
+  // The last kernel DOES write to user's accessor passed to reduction.
+  // Associate it with handler manually.
+  if (NWorkGroups == 1 && !Reduction::is_usm)
+    Redu.associateWithHandler(CGH);
 
   // The last work-group may be not fully loaded with work, or the work group
   // size may be not power of two. Those two cases considered inefficient
@@ -981,20 +995,21 @@ reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
   auto In = Redu.getReadAccToPreviousPartialReds(CGH);
   if (Reduction::is_usm && NWorkGroups == 1) {
     if (HasUniformWG)
-      reduAuxCGFuncImpl<KernelName, KernelType, Dims, Reduction, true>(
-          CGH, Range, NWorkItems, Redu, In, Redu.getUSMPointer());
+      reduAuxCGFuncImpl<KernelName, KernelType, true>(
+          CGH, NWorkItems, NWorkGroups, WGSize, Redu, In, Redu.getUSMPointer());
     else
-      reduAuxCGFuncImpl<KernelName, KernelType, Dims, Reduction, false>(
-          CGH, Range, NWorkItems, Redu, In, Redu.getUSMPointer());
+      reduAuxCGFuncImpl<KernelName, KernelType, false>(
+          CGH, NWorkItems, NWorkGroups, WGSize, Redu, In, Redu.getUSMPointer());
   } else {
     auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, CGH);
     if (HasUniformWG)
-      reduAuxCGFuncImpl<KernelName, KernelType, Dims, Reduction, true>(
-          CGH, Range, NWorkItems, Redu, In, Out);
+      reduAuxCGFuncImpl<KernelName, KernelType, true>(
+          CGH, NWorkItems, NWorkGroups, WGSize, Redu, In, Out);
     else
-      reduAuxCGFuncImpl<KernelName, KernelType, Dims, Reduction, false>(
-          CGH, Range, NWorkItems, Redu, In, Out);
+      reduAuxCGFuncImpl<KernelName, KernelType, false>(
+          CGH, NWorkItems, NWorkGroups, WGSize, Redu, In, Out);
   }
+  return NWorkGroups;
 }
 
 } // namespace detail
diff --git a/sycl/source/CMakeLists.txt b/sycl/source/CMakeLists.txt
@@ -124,6 +124,7 @@ set(SYCL_SOURCES
     "detail/queue_impl.cpp"
     "detail/os_util.cpp"
     "detail/platform_util.cpp"
+    "detail/reduction.cpp"
     "detail/sampler_impl.cpp"
     "detail/stream_impl.cpp"
     "detail/scheduler/commands.cpp"
diff --git a/sycl/source/detail/reduction.cpp b/sycl/source/detail/reduction.cpp
@@ -0,0 +1,67 @@
+//==---------------- reduction.cpp - SYCL reduction ------------*- C++ -*---==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <CL/sycl/intel/reduction.hpp>
+#include <detail/queue_impl.hpp>
+
+__SYCL_INLINE_NAMESPACE(cl) {
+namespace sycl {
+namespace intel {
+namespace detail {
+
+// Picks the work-group size (returned) and the number of work-groups
+// (NWorkGroups) for reducing NWorkItems partial sums with at most MaxWGSize
+// work-items per group. TODO: the heuristic is imperfect and can be improved.
+__SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
+                                       size_t &NWorkGroups) {
+  size_t WGSize = MaxWGSize;
+  if (NWorkItems <= WGSize) {
+    NWorkGroups = 1;
+    WGSize = NWorkItems;
+  } else {
+    NWorkGroups = NWorkItems / WGSize;
+    size_t Rem = NWorkItems % WGSize;
+    if (Rem != 0) {
+      // E.g. for MaxWGSize = 128 and NWorkItems = 128 + 32, 5 uniform groups
+      // of 32 work-items are preferred to one group of 128 plus one of 32.
+      size_t NWorkGroupsAlt = NWorkItems / Rem;
+      if (NWorkItems % Rem == 0 && NWorkGroupsAlt <= MaxWGSize) {
+        NWorkGroups = NWorkGroupsAlt;
+        WGSize = Rem;
+      } else {
+        // The last group is partially loaded; it still counts as a group.
+        NWorkGroups++;
+      }
+    }
+  }
+  return WGSize;
+}
+
+// Returns the largest work-group size usable by the auxiliary reduction
+// kernels on the device of Queue, given the local memory needed per work-item.
+__SYCL_EXPORT size_t
+reduGetMaxWGSize(shared_ptr_class<sycl::detail::queue_impl> Queue,
+                 size_t LocalMemBytesPerWorkItem) {
+  device Dev = Queue->get_device();
+  size_t WGSize = Dev.get_info<info::device::max_work_group_size>();
+  if (LocalMemBytesPerWorkItem != 0) {
+    size_t MemSize = Dev.get_info<info::device::local_mem_size>();
+    size_t WGSizePerMem = MemSize / LocalMemBytesPerWorkItem;
+    // A non-power-of-two group needs one extra local-memory element for the
+    // reduction, so such a memory-bound size must leave room for it.
+    if ((WGSizePerMem & (WGSizePerMem - 1)) != 0)
+      WGSizePerMem--;
+    WGSize = (std::min)(WGSizePerMem, WGSize);
+  }
+  return WGSize;
+}
+
+} // namespace detail
+} // namespace intel
+} // namespace sycl
+} // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/test/reduction/reduction_nd_s1_rw.cpp b/sycl/test/reduction/reduction_nd_s1_rw.cpp
@@ -87,7 +87,7 @@ int main() {
 
   // Check with various types.
   test<float, 1, std::multiplies<float>>(1, 8, 256);
-  test<float, 1, intel::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
+  test<float, 1, intel::minimum<float>>(getMaximumFPValue<float>(), 1, 16);
   test<float, 1, intel::maximum<float>>(getMinimumFPValue<float>(), 8, 256);
 
   // Check with CUSTOM type.
diff --git a/sycl/test/reduction/reduction_transparent.cpp b/sycl/test/reduction/reduction_transparent.cpp
@@ -113,6 +113,7 @@ int main() {
   test<float, 0, intel::maximum<>>(getMinimumFPValue<float>(), 7, 7 * 5);
   test<signed char, 0, intel::plus<>>(0, 7, 49);
   test<unsigned char, 1, std::multiplies<>>(1, 4, 16);
+  test<unsigned short, 0, intel::plus<>>(0, 1, 512 + 32);
 #endif // __cplusplus >= 201402L
 
   std::cout << "Test passed\n";