Skip to content

Commit d5620dd

Browse files
committed
[SYCL] Fix endless-loop in reduction with nd_range having 1 element local range
The reduction implementation for the data types not having fast atomics may require running an additional kernel as many times as needed to converge all partial sums into the last one scalar sum, which possible only when the work-group size is greater than 1. The additional kernel used work-group size specified in the original user's kernel, which is not necessary, and causes endless loop when local range has only 1 element. The patch checks the max available work-group size on the device, it also checks the local memory available and chooses the work-group size for the additional kernels, which eliminates the endless loop and makes the converge process faster as bigger work-group size is chosen. Signed-off-by: Vyacheslav N Klochkov <[email protected]>
1 parent 7d6ede4 commit d5620dd

File tree

6 files changed

+137
-59
lines changed

6 files changed

+137
-59
lines changed

sycl/include/CL/sycl/handler.hpp

Lines changed: 21 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,7 @@ template <typename T, class BinaryOperation, int Dims, bool IsUSM,
179179
class reduction_impl;
180180

181181
using cl::sycl::detail::enable_if_t;
182+
using cl::sycl::detail::queue_impl;
182183

183184
template <typename KernelName, typename KernelType, int Dims, class Reduction,
184185
typename OutputT>
@@ -191,10 +192,14 @@ enable_if_t<!Reduction::has_fast_atomics>
191192
reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
192193
Reduction &Redu);
193194

194-
template <typename KernelName, typename KernelType, int Dims, class Reduction>
195-
enable_if_t<!Reduction::has_fast_atomics>
196-
reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
195+
template <typename KernelName, typename KernelType, class Reduction>
196+
enable_if_t<!Reduction::has_fast_atomics, size_t>
197+
reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
197198
Reduction &Redu);
199+
200+
__SYCL_EXPORT size_t reduGetMaxWGSize(shared_ptr_class<queue_impl> Queue,
201+
size_t LocalMemBytesPerWorkItem);
202+
198203
} // namespace detail
199204
} // namespace intel
200205

@@ -1048,37 +1053,26 @@ class __SYCL_EXPORT handler {
10481053
shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
10491054
this->finalize();
10501055

1051-
// 2. Run the additional aux kernel as many times as needed to reduce
1052-
// all partial sums into one scalar.
1056+
// 2. Find the maximal work group size usable for the additional kernel(s).
1057+
size_t MaxWGSize;
1058+
if (NWorkGroups > 1) {
1059+
constexpr bool HFR = Reduction::has_fast_reduce;
1060+
size_t OneElemSize = HFR ? 0 : sizeof(typename Reduction::result_type);
1061+
MaxWGSize = intel::detail::reduGetMaxWGSize(QueueCopy, OneElemSize);
1062+
assert(MaxWGSize > 1 &&
1063+
"Work group size must be greater than 1 to avoid endless loop.");
1064+
}
10531065

1054-
// TODO: user's nd_range and the work-group size specified there must
1055-
// be honored only for the main kernel that calls user's lambda functions.
1056-
// There is no need in using the same work-group size in these additional
1057-
// kernels. Thus, the better strategy here is to make the work-group size
1058-
// as big as possible to converge/reduce the partial sums into the last
1059-
// sum faster.
1060-
size_t WGSize = Range.get_local_range().size();
1066+
// 3. Run the additional kernel as many times as needed to reduce
1067+
// all partial sums into one scalar.
10611068
size_t NWorkItems = NWorkGroups;
10621069
while (NWorkItems > 1) {
1063-
WGSize = std::min(WGSize, NWorkItems);
1064-
NWorkGroups = NWorkItems / WGSize;
1065-
// The last group may be not fully loaded. Still register it as a group.
1066-
if ((NWorkItems % WGSize) != 0)
1067-
++NWorkGroups;
1068-
nd_range<1> Range(range<1>(WGSize * NWorkGroups), range<1>(WGSize));
1069-
10701070
handler AuxHandler(QueueCopy, MIsHost);
10711071
AuxHandler.saveCodeLoc(MCodeLoc);
10721072

1073-
// The last kernel DOES write to user's accessor passed to reduction.
1074-
// Associate it with handler manually.
1075-
if (NWorkGroups == 1 && !Reduction::is_usm)
1076-
Redu.associateWithHandler(AuxHandler);
1077-
intel::detail::reduAuxCGFunc<KernelName, KernelType>(AuxHandler, Range,
1078-
NWorkItems, Redu);
1073+
NWorkItems = intel::detail::reduAuxCGFunc<KernelName, KernelType>(
1074+
AuxHandler, NWorkItems, MaxWGSize, Redu);
10791075
MLastEvent = AuxHandler.finalize();
1080-
1081-
NWorkItems = NWorkGroups;
10821076
} // end while (NWorkItems > 1)
10831077
}
10841078

sycl/include/CL/sycl/intel/reduction.hpp

Lines changed: 46 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#pragma once
1010

1111
#include <CL/sycl/accessor.hpp>
12+
#include <CL/sycl/handler.hpp>
1213
#include <CL/sycl/intel/group_algorithm.hpp>
1314

1415
__SYCL_INLINE_NAMESPACE(cl) {
@@ -17,6 +18,11 @@ namespace intel {
1718

1819
namespace detail {
1920

21+
__SYCL_EXPORT size_t reduGetMaxWGSize(shared_ptr_class<queue_impl> Queue,
22+
size_t LocalMemBytesPerWorkItem);
23+
__SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
24+
size_t &NWorkGroups);
25+
2026
using cl::sycl::detail::bool_constant;
2127
using cl::sycl::detail::enable_if_t;
2228
using cl::sycl::detail::is_geninteger16bit;
@@ -867,19 +873,19 @@ reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
867873
/// of work-groups. At the end of each work-groups the partial sum is written
868874
/// to a global buffer.
869875
///
870-
/// Briefly: aux kernel, intel:reduce(), reproducible results,FP + ADD/MIN/MAX
871-
template <typename KernelName, typename KernelType, int Dims, class Reduction,
872-
bool UniformWG, typename InputT, typename OutputT>
876+
/// Briefly: aux kernel, intel:reduce(), reproducible results, FP + ADD/MIN/MAX
877+
template <typename KernelName, typename KernelType, bool UniformWG,
878+
class Reduction, typename InputT, typename OutputT>
873879
enable_if_t<Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
874-
reduAuxCGFuncImpl(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
875-
Reduction &, InputT In, OutputT Out) {
876-
size_t NWorkGroups = Range.get_group_range().size();
877-
bool IsUpdateOfUserVar =
878-
Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;
879-
880+
reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups,
881+
size_t WGSize, Reduction &, InputT In, OutputT Out) {
880882
using Name = typename get_reduction_aux_kernel_name_t<
881883
KernelName, KernelType, Reduction::is_usm, UniformWG, OutputT>::name;
882-
CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
884+
885+
bool IsUpdateOfUserVar =
886+
Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;
887+
nd_range<1> Range{range<1>(NWorkItems), range<1>(WGSize)};
888+
CGH.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
883889
typename Reduction::binary_operation BOp;
884890
size_t WGID = NDIt.get_group_linear_id();
885891
size_t GID = NDIt.get_global_linear_id();
@@ -903,14 +909,11 @@ reduAuxCGFuncImpl(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
903909
/// to a global buffer.
904910
///
905911
/// Briefly: aux kernel, tree-reduction, CUSTOM types/ops.
906-
template <typename KernelName, typename KernelType, int Dims, class Reduction,
907-
bool UniformPow2WG, typename InputT, typename OutputT>
912+
template <typename KernelName, typename KernelType, bool UniformPow2WG,
913+
class Reduction, typename InputT, typename OutputT>
908914
enable_if_t<!Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
909-
reduAuxCGFuncImpl(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
910-
Reduction &Redu, InputT In, OutputT Out) {
911-
size_t WGSize = Range.get_local_range().size();
912-
size_t NWorkGroups = Range.get_group_range().size();
913-
915+
reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups,
916+
size_t WGSize, Reduction &Redu, InputT In, OutputT Out) {
914917
bool IsUpdateOfUserVar =
915918
Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;
916919

@@ -924,7 +927,8 @@ reduAuxCGFuncImpl(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
924927
auto ReduIdentity = Redu.getIdentity();
925928
using Name = typename get_reduction_aux_kernel_name_t<
926929
KernelName, KernelType, Reduction::is_usm, UniformPow2WG, OutputT>::name;
927-
CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
930+
nd_range<1> Range{range<1>(NWorkItems), range<1>(WGSize)};
931+
CGH.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
928932
size_t WGSize = NDIt.get_local_range().size();
929933
size_t LID = NDIt.get_local_linear_id();
930934
size_t GID = NDIt.get_global_linear_id();
@@ -962,12 +966,22 @@ reduAuxCGFuncImpl(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
962966
});
963967
}
964968

965-
template <typename KernelName, typename KernelType, int Dims, class Reduction>
966-
enable_if_t<!Reduction::has_fast_atomics>
967-
reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
969+
/// Implements a command group function that enqueues a kernel that does one
970+
/// iteration of reduction of elements in each of work-groups.
971+
/// At the end of each work-group the partial sum is written to a global buffer.
972+
/// The function returns the number of the newly generated partial sums.
973+
template <typename KernelName, typename KernelType, class Reduction>
974+
enable_if_t<!Reduction::has_fast_atomics, size_t>
975+
reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
968976
Reduction &Redu) {
969-
size_t WGSize = Range.get_local_range().size();
970-
size_t NWorkGroups = Range.get_group_range().size();
977+
978+
size_t NWorkGroups;
979+
size_t WGSize = reduComputeWGSize(NWorkItems, MaxWGSize, NWorkGroups);
980+
981+
// The last kernel DOES write to user's accessor passed to reduction.
982+
// Associate it with handler manually.
983+
if (NWorkGroups == 1 && !Reduction::is_usm)
984+
Redu.associateWithHandler(CGH);
971985

972986
// The last work-group may be not fully loaded with work, or the work group
973987
// size may be not power of two. Those two cases considered inefficient
@@ -981,20 +995,21 @@ reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
981995
auto In = Redu.getReadAccToPreviousPartialReds(CGH);
982996
if (Reduction::is_usm && NWorkGroups == 1) {
983997
if (HasUniformWG)
984-
reduAuxCGFuncImpl<KernelName, KernelType, Dims, Reduction, true>(
985-
CGH, Range, NWorkItems, Redu, In, Redu.getUSMPointer());
998+
reduAuxCGFuncImpl<KernelName, KernelType, true>(
999+
CGH, NWorkItems, NWorkGroups, WGSize, Redu, In, Redu.getUSMPointer());
9861000
else
987-
reduAuxCGFuncImpl<KernelName, KernelType, Dims, Reduction, false>(
988-
CGH, Range, NWorkItems, Redu, In, Redu.getUSMPointer());
1001+
reduAuxCGFuncImpl<KernelName, KernelType, false>(
1002+
CGH, NWorkItems, NWorkGroups, WGSize, Redu, In, Redu.getUSMPointer());
9891003
} else {
9901004
auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, CGH);
9911005
if (HasUniformWG)
992-
reduAuxCGFuncImpl<KernelName, KernelType, Dims, Reduction, true>(
993-
CGH, Range, NWorkItems, Redu, In, Out);
1006+
reduAuxCGFuncImpl<KernelName, KernelType, true>(
1007+
CGH, NWorkItems, NWorkGroups, WGSize, Redu, In, Out);
9941008
else
995-
reduAuxCGFuncImpl<KernelName, KernelType, Dims, Reduction, false>(
996-
CGH, Range, NWorkItems, Redu, In, Out);
1009+
reduAuxCGFuncImpl<KernelName, KernelType, false>(
1010+
CGH, NWorkItems, NWorkGroups, WGSize, Redu, In, Out);
9971011
}
1012+
return NWorkGroups;
9981013
}
9991014

10001015
} // namespace detail

sycl/source/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,7 @@ set(SYCL_SOURCES
124124
"detail/queue_impl.cpp"
125125
"detail/os_util.cpp"
126126
"detail/platform_util.cpp"
127+
"detail/reduction.cpp"
127128
"detail/sampler_impl.cpp"
128129
"detail/stream_impl.cpp"
129130
"detail/scheduler/commands.cpp"

sycl/source/detail/reduction.cpp

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
//==---------------- reduction.cpp - SYCL reduction ------------*- C++ -*---==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include <CL/sycl/intel/reduction.hpp>
10+
#include <detail/queue_impl.hpp>
11+
12+
__SYCL_INLINE_NAMESPACE(cl) {
13+
namespace sycl {
14+
namespace intel {
15+
namespace detail {
16+
17+
// TODO: The algorithm of choosing the work-group size is definitely
18+
// imperfect now and can be improved.
19+
__SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
20+
size_t &NWorkGroups) {
21+
size_t WGSize = MaxWGSize;
22+
if (NWorkItems <= WGSize) {
23+
NWorkGroups = 1;
24+
WGSize = NWorkItems;
25+
} else {
26+
NWorkGroups = NWorkItems / WGSize;
27+
size_t Rem = NWorkItems % WGSize;
28+
if (Rem != 0) {
29+
// Let's say MaxWGSize = 128 and NWorkItems is (128+32).
30+
// It seems better to have 5 groups 32 work-items each than 2 groups with
31+
// 128 work-items in the 1st group and 32 work-items in the 2nd group.
32+
size_t NWorkGroupsAlt = NWorkItems / Rem;
33+
size_t RemAlt = NWorkItems % Rem;
34+
if (RemAlt == 0 && NWorkGroupsAlt <= MaxWGSize) {
35+
NWorkGroups = NWorkGroupsAlt;
36+
WGSize = Rem;
37+
}
38+
} else {
39+
NWorkGroups++;
40+
}
41+
}
42+
return WGSize;
43+
}
44+
45+
__SYCL_EXPORT size_t
46+
reduGetMaxWGSize(shared_ptr_class<sycl::detail::queue_impl> Queue,
47+
size_t LocalMemBytesPerWorkItem) {
48+
device Dev = Queue->get_device();
49+
size_t WGSize = Dev.get_info<info::device::max_work_group_size>();
50+
if (LocalMemBytesPerWorkItem != 0) {
51+
size_t MemSize = Dev.get_info<info::device::local_mem_size>();
52+
size_t WGSizePerMem = MemSize / LocalMemBytesPerWorkItem;
53+
54+
// If the work group size is not pow of two, then an additional element
55+
// in local memory is needed for the reduction algorithm and thus the real
56+
// work-group size requirement per available memory is stricter.
57+
if ((WGSize & (WGSize - 1)) == 0)
58+
WGSizePerMem--;
59+
WGSize = (std::min)(WGSizePerMem, WGSize);
60+
}
61+
return WGSize;
62+
}
63+
64+
} // namespace detail
65+
} // namespace intel
66+
} // namespace sycl
67+
} // __SYCL_INLINE_NAMESPACE(cl)

sycl/test/reduction/reduction_nd_s1_rw.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ int main() {
8787

8888
// Check with various types.
8989
test<float, 1, std::multiplies<float>>(1, 8, 256);
90-
test<float, 1, intel::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
90+
test<float, 1, intel::minimum<float>>(getMaximumFPValue<float>(), 1, 16);
9191
test<float, 1, intel::maximum<float>>(getMinimumFPValue<float>(), 8, 256);
9292

9393
// Check with CUSTOM type.

sycl/test/reduction/reduction_transparent.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ int main() {
113113
test<float, 0, intel::maximum<>>(getMinimumFPValue<float>(), 7, 7 * 5);
114114
test<signed char, 0, intel::plus<>>(0, 7, 49);
115115
test<unsigned char, 1, std::multiplies<>>(1, 4, 16);
116+
test<unsigned short, 0, intel::plus<>>(0, 1, 512 + 32);
116117
#endif // __cplusplus >= 201402L
117118

118119
std::cout << "Test passed\n";

0 commit comments

Comments
 (0)