Skip to content

Commit e6b6ae7

Browse files
authored
[SYCL] Fix endless-loop in reduction with 1 element local range (#2022)
For data types without fast atomics, the reduction implementation may need to run an additional kernel as many times as needed to converge all partial sums into a single scalar sum, which is possible only when the work-group size is greater than 1. Previously, that additional kernel reused the work-group size specified in the user's original kernel — which is unnecessary, and causes an endless loop when the local range has only 1 element. This patch checks the maximum available work-group size on the device, also checks the available local memory, and chooses the work-group size for the additional kernels accordingly; this eliminates the endless loop and makes the convergence process faster, since a bigger work-group size is chosen. Signed-off-by: Vyacheslav N Klochkov <[email protected]>
1 parent 84bf234 commit e6b6ae7

File tree

7 files changed

+159
-61
lines changed

7 files changed

+159
-61
lines changed

sycl/include/CL/sycl/handler.hpp

Lines changed: 41 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,7 @@ template <typename T, class BinaryOperation, int Dims, bool IsUSM,
179179
class reduction_impl;
180180

181181
using cl::sycl::detail::enable_if_t;
182+
using cl::sycl::detail::queue_impl;
182183

183184
template <typename KernelName, typename KernelType, int Dims, class Reduction,
184185
typename OutputT>
@@ -191,10 +192,14 @@ enable_if_t<!Reduction::has_fast_atomics>
191192
reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
192193
Reduction &Redu);
193194

194-
template <typename KernelName, typename KernelType, int Dims, class Reduction>
195-
enable_if_t<!Reduction::has_fast_atomics>
196-
reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
195+
template <typename KernelName, typename KernelType, class Reduction>
196+
enable_if_t<!Reduction::has_fast_atomics, size_t>
197+
reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
197198
Reduction &Redu);
199+
200+
__SYCL_EXPORT size_t reduGetMaxWGSize(shared_ptr_class<queue_impl> Queue,
201+
size_t LocalMemBytesPerWorkItem);
202+
198203
} // namespace detail
199204
} // namespace intel
200205

@@ -1036,8 +1041,6 @@ class __SYCL_EXPORT handler {
10361041
int Dims, typename Reduction>
10371042
detail::enable_if_t<!Reduction::has_fast_atomics>
10381043
parallel_for(nd_range<Dims> Range, Reduction Redu, KernelType KernelFunc) {
1039-
size_t NWorkGroups = Range.get_group_range().size();
1040-
10411044
// This parallel_for() is lowered to the following sequence:
10421045
// 1) Call a kernel that a) call user's lambda function and b) performs
10431046
// one iteration of reduction, storing the partial reductions/sums
@@ -1051,42 +1054,51 @@ class __SYCL_EXPORT handler {
10511054
// 2) Call an aux kernel (if necessary, i.e. if N2 > 1) as many times as
10521055
// necessary to reduce all partial sums into one final sum.
10531056

1057+
// Before running the kernels, check that device has enough local memory
1058+
// to hold local arrays that may be required for the reduction algorithm.
1059+
// TODO: If the work-group-size is limited by the local memory, then
1060+
// a special version of the main kernel may be created. The one that would
1061+
// not use local accessors, which means it would not do the reduction in
1062+
// the main kernel, but simply generate Range.get_global_range.size() number
1063+
// of partial sums, leaving the reduction work to the additional/aux
1064+
// kernels.
1065+
constexpr bool HFR = Reduction::has_fast_reduce;
1066+
size_t OneElemSize = HFR ? 0 : sizeof(typename Reduction::result_type);
1067+
// TODO: currently the maximal work group size is determined for the given
1068+
// queue/device, while it may be safer to use queries to the kernel compiled
1069+
// for the device.
1070+
size_t MaxWGSize = intel::detail::reduGetMaxWGSize(MQueue, OneElemSize);
1071+
if (Range.get_local_range().size() > MaxWGSize)
1072+
throw sycl::runtime_error("The implementation handling parallel_for with"
1073+
" reduction requires smaller work group size.",
1074+
PI_INVALID_WORK_GROUP_SIZE);
1075+
10541076
// 1. Call the kernel that includes user's lambda function.
10551077
intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
10561078
shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
10571079
this->finalize();
10581080

1059-
// 2. Run the additional aux kernel as many times as needed to reduce
1081+
// 2. Run the additional kernel as many times as needed to reduce
10601082
// all partial sums into one scalar.
10611083

1062-
// TODO: user's nd_range and the work-group size specified there must
1063-
// be honored only for the main kernel that calls user's lambda functions.
1064-
// There is no need in using the same work-group size in these additional
1065-
// kernels. Thus, the better strategy here is to make the work-group size
1066-
// as big as possible to converge/reduce the partial sums into the last
1067-
// sum faster.
1068-
size_t WGSize = Range.get_local_range().size();
1069-
size_t NWorkItems = NWorkGroups;
1084+
// TODO: Create a special slow/sequential version of the kernel that would
1085+
// handle the reduction instead of reporting an assert below.
1086+
if (MaxWGSize <= 1)
1087+
throw sycl::runtime_error("The implementation handling parallel_for with "
1088+
"reduction requires the maximal work group "
1089+
"size to be greater than 1 to converge. "
1090+
"The maximal work group size depends on the "
1091+
"device and the size of the objects passed to "
1092+
"the reduction.",
1093+
PI_INVALID_WORK_GROUP_SIZE);
1094+
size_t NWorkItems = Range.get_group_range().size();
10701095
while (NWorkItems > 1) {
1071-
WGSize = std::min(WGSize, NWorkItems);
1072-
NWorkGroups = NWorkItems / WGSize;
1073-
// The last group may be not fully loaded. Still register it as a group.
1074-
if ((NWorkItems % WGSize) != 0)
1075-
++NWorkGroups;
1076-
nd_range<1> Range(range<1>(WGSize * NWorkGroups), range<1>(WGSize));
1077-
10781096
handler AuxHandler(QueueCopy, MIsHost);
10791097
AuxHandler.saveCodeLoc(MCodeLoc);
10801098

1081-
// The last kernel DOES write to user's accessor passed to reduction.
1082-
// Associate it with handler manually.
1083-
if (NWorkGroups == 1 && !Reduction::is_usm)
1084-
Redu.associateWithHandler(AuxHandler);
1085-
intel::detail::reduAuxCGFunc<KernelName, KernelType>(AuxHandler, Range,
1086-
NWorkItems, Redu);
1099+
NWorkItems = intel::detail::reduAuxCGFunc<KernelName, KernelType>(
1100+
AuxHandler, NWorkItems, MaxWGSize, Redu);
10871101
MLastEvent = AuxHandler.finalize();
1088-
1089-
NWorkItems = NWorkGroups;
10901102
} // end while (NWorkItems > 1)
10911103
}
10921104

sycl/include/CL/sycl/intel/reduction.hpp

Lines changed: 46 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#pragma once
1010

1111
#include <CL/sycl/accessor.hpp>
12+
#include <CL/sycl/handler.hpp>
1213
#include <CL/sycl/intel/group_algorithm.hpp>
1314

1415
__SYCL_INLINE_NAMESPACE(cl) {
@@ -17,6 +18,11 @@ namespace intel {
1718

1819
namespace detail {
1920

21+
__SYCL_EXPORT size_t reduGetMaxWGSize(shared_ptr_class<queue_impl> Queue,
22+
size_t LocalMemBytesPerWorkItem);
23+
__SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
24+
size_t &NWorkGroups);
25+
2026
using cl::sycl::detail::bool_constant;
2127
using cl::sycl::detail::enable_if_t;
2228
using cl::sycl::detail::is_geninteger16bit;
@@ -867,19 +873,19 @@ reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
867873
/// of work-groups. At the end of each work-groups the partial sum is written
868874
/// to a global buffer.
869875
///
870-
/// Briefly: aux kernel, intel:reduce(), reproducible results,FP + ADD/MIN/MAX
871-
template <typename KernelName, typename KernelType, int Dims, class Reduction,
872-
bool UniformWG, typename InputT, typename OutputT>
876+
/// Briefly: aux kernel, intel:reduce(), reproducible results, FP + ADD/MIN/MAX
877+
template <typename KernelName, typename KernelType, bool UniformWG,
878+
class Reduction, typename InputT, typename OutputT>
873879
enable_if_t<Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
874-
reduAuxCGFuncImpl(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
875-
Reduction &, InputT In, OutputT Out) {
876-
size_t NWorkGroups = Range.get_group_range().size();
877-
bool IsUpdateOfUserVar =
878-
Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;
879-
880+
reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups,
881+
size_t WGSize, Reduction &, InputT In, OutputT Out) {
880882
using Name = typename get_reduction_aux_kernel_name_t<
881883
KernelName, KernelType, Reduction::is_usm, UniformWG, OutputT>::name;
882-
CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
884+
885+
bool IsUpdateOfUserVar =
886+
Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;
887+
nd_range<1> Range{range<1>(NWorkItems), range<1>(WGSize)};
888+
CGH.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
883889
typename Reduction::binary_operation BOp;
884890
size_t WGID = NDIt.get_group_linear_id();
885891
size_t GID = NDIt.get_global_linear_id();
@@ -903,14 +909,11 @@ reduAuxCGFuncImpl(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
903909
/// to a global buffer.
904910
///
905911
/// Briefly: aux kernel, tree-reduction, CUSTOM types/ops.
906-
template <typename KernelName, typename KernelType, int Dims, class Reduction,
907-
bool UniformPow2WG, typename InputT, typename OutputT>
912+
template <typename KernelName, typename KernelType, bool UniformPow2WG,
913+
class Reduction, typename InputT, typename OutputT>
908914
enable_if_t<!Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
909-
reduAuxCGFuncImpl(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
910-
Reduction &Redu, InputT In, OutputT Out) {
911-
size_t WGSize = Range.get_local_range().size();
912-
size_t NWorkGroups = Range.get_group_range().size();
913-
915+
reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups,
916+
size_t WGSize, Reduction &Redu, InputT In, OutputT Out) {
914917
bool IsUpdateOfUserVar =
915918
Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;
916919

@@ -924,7 +927,8 @@ reduAuxCGFuncImpl(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
924927
auto ReduIdentity = Redu.getIdentity();
925928
using Name = typename get_reduction_aux_kernel_name_t<
926929
KernelName, KernelType, Reduction::is_usm, UniformPow2WG, OutputT>::name;
927-
CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
930+
nd_range<1> Range{range<1>(NWorkItems), range<1>(WGSize)};
931+
CGH.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
928932
size_t WGSize = NDIt.get_local_range().size();
929933
size_t LID = NDIt.get_local_linear_id();
930934
size_t GID = NDIt.get_global_linear_id();
@@ -962,12 +966,22 @@ reduAuxCGFuncImpl(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
962966
});
963967
}
964968

965-
template <typename KernelName, typename KernelType, int Dims, class Reduction>
966-
enable_if_t<!Reduction::has_fast_atomics>
967-
reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
969+
/// Implements a command group function that enqueues a kernel that does one
970+
/// iteration of reduction of elements in each of work-groups.
971+
/// At the end of each work-group the partial sum is written to a global buffer.
972+
/// The function returns the number of the newly generated partial sums.
973+
template <typename KernelName, typename KernelType, class Reduction>
974+
enable_if_t<!Reduction::has_fast_atomics, size_t>
975+
reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
968976
Reduction &Redu) {
969-
size_t WGSize = Range.get_local_range().size();
970-
size_t NWorkGroups = Range.get_group_range().size();
977+
978+
size_t NWorkGroups;
979+
size_t WGSize = reduComputeWGSize(NWorkItems, MaxWGSize, NWorkGroups);
980+
981+
// The last kernel DOES write to user's accessor passed to reduction.
982+
// Associate it with handler manually.
983+
if (NWorkGroups == 1 && !Reduction::is_usm)
984+
Redu.associateWithHandler(CGH);
971985

972986
// The last work-group may be not fully loaded with work, or the work group
973987
// size may be not power of two. Those two cases considered inefficient
@@ -981,20 +995,21 @@ reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
981995
auto In = Redu.getReadAccToPreviousPartialReds(CGH);
982996
if (Reduction::is_usm && NWorkGroups == 1) {
983997
if (HasUniformWG)
984-
reduAuxCGFuncImpl<KernelName, KernelType, Dims, Reduction, true>(
985-
CGH, Range, NWorkItems, Redu, In, Redu.getUSMPointer());
998+
reduAuxCGFuncImpl<KernelName, KernelType, true>(
999+
CGH, NWorkItems, NWorkGroups, WGSize, Redu, In, Redu.getUSMPointer());
9861000
else
987-
reduAuxCGFuncImpl<KernelName, KernelType, Dims, Reduction, false>(
988-
CGH, Range, NWorkItems, Redu, In, Redu.getUSMPointer());
1001+
reduAuxCGFuncImpl<KernelName, KernelType, false>(
1002+
CGH, NWorkItems, NWorkGroups, WGSize, Redu, In, Redu.getUSMPointer());
9891003
} else {
9901004
auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, CGH);
9911005
if (HasUniformWG)
992-
reduAuxCGFuncImpl<KernelName, KernelType, Dims, Reduction, true>(
993-
CGH, Range, NWorkItems, Redu, In, Out);
1006+
reduAuxCGFuncImpl<KernelName, KernelType, true>(
1007+
CGH, NWorkItems, NWorkGroups, WGSize, Redu, In, Out);
9941008
else
995-
reduAuxCGFuncImpl<KernelName, KernelType, Dims, Reduction, false>(
996-
CGH, Range, NWorkItems, Redu, In, Out);
1009+
reduAuxCGFuncImpl<KernelName, KernelType, false>(
1010+
CGH, NWorkItems, NWorkGroups, WGSize, Redu, In, Out);
9971011
}
1012+
return NWorkGroups;
9981013
}
9991014

10001015
} // namespace detail

sycl/source/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,7 @@ set(SYCL_SOURCES
125125
"detail/queue_impl.cpp"
126126
"detail/os_util.cpp"
127127
"detail/platform_util.cpp"
128+
"detail/reduction.cpp"
128129
"detail/sampler_impl.cpp"
129130
"detail/stream_impl.cpp"
130131
"detail/scheduler/commands.cpp"

sycl/source/detail/reduction.cpp

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
//==---------------- reduction.cpp - SYCL reduction ------------*- C++ -*---==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include <CL/sycl/intel/reduction.hpp>
10+
#include <detail/queue_impl.hpp>
11+
12+
__SYCL_INLINE_NAMESPACE(cl) {
13+
namespace sycl {
14+
namespace intel {
15+
namespace detail {
16+
17+
// TODO: The algorithm of choosing the work-group size is definitely
// imperfect now and can be improved.

// Fallback so this translation unit also compiles standalone; it is a no-op
// in the normal build where the export header already defines the macro.
#ifndef __SYCL_EXPORT
#define __SYCL_EXPORT
#endif

/// Chooses the work-group size (returned) and the number of work-groups
/// (written to \p NWorkGroups) used to reduce \p NWorkItems partial sums,
/// honoring the maximal allowed work-group size \p MaxWGSize.
/// The chosen grid always covers all work-items, i.e. the returned WGSize
/// multiplied by NWorkGroups is greater than or equal to NWorkItems.
__SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
                                       size_t &NWorkGroups) {
  size_t WGSize = MaxWGSize;
  if (NWorkItems <= WGSize) {
    // All work-items fit into a single work-group.
    NWorkGroups = 1;
    WGSize = NWorkItems;
  } else {
    NWorkGroups = NWorkItems / WGSize;
    size_t Rem = NWorkItems % WGSize;
    if (Rem != 0) {
      // Let's say MaxWGSize = 128 and NWorkItems is (128+32).
      // It seems better to have 5 groups 32 work-items each than 2 groups with
      // 128 work-items in the 1st group and 32 work-items in the 2nd group.
      size_t NWorkGroupsAlt = NWorkItems / Rem;
      size_t RemAlt = NWorkItems % Rem;
      if (RemAlt == 0 && NWorkGroupsAlt <= MaxWGSize) {
        NWorkGroups = NWorkGroupsAlt;
        WGSize = Rem;
      } else {
        // The alternative uniform split does not apply; add one more
        // (partially loaded) work-group to cover the remaining Rem
        // work-items. Note: the increment belongs HERE, not on the even
        // division path — an even division needs no extra group, while an
        // uneven one without the alternative split must not leave the tail
        // work-items uncovered.
        NWorkGroups++;
      }
    }
  }
  return WGSize;
}
44+
45+
/// Returns the maximal possible work-group size for the device associated
/// with \p Queue, additionally restricted by the amount of local memory
/// available on the device when each work-item of the reduction requires
/// \p LocalMemBytesPerWorkItem bytes of local memory (0 means the reduction
/// kernel uses no local memory and only the device limit applies).
__SYCL_EXPORT size_t
reduGetMaxWGSize(shared_ptr_class<sycl::detail::queue_impl> Queue,
                 size_t LocalMemBytesPerWorkItem) {
  device Dev = Queue->get_device();
  size_t WGSize = Dev.get_info<info::device::max_work_group_size>();
  if (LocalMemBytesPerWorkItem != 0) {
    size_t MemSize = Dev.get_info<info::device::local_mem_size>();
    size_t WGSizePerMem = MemSize / LocalMemBytesPerWorkItem;

    // If the work group size is not pow of two, then an additional element
    // in local memory is needed for the reduction algorithm and thus the real
    // work-group size requirement per available memory is stricter.
    // NOTE(review): the previous condition decremented when the size WAS a
    // power of two (and tested the device limit instead of the memory-limited
    // candidate), contradicting the comment above; it could also underflow
    // when WGSizePerMem == 0. Fixed to match the stated intent.
    if ((WGSizePerMem & (WGSizePerMem - 1)) != 0)
      WGSizePerMem--;
    WGSize = (std::min)(WGSizePerMem, WGSize);
  }
  return WGSize;
}
63+
64+
} // namespace detail
65+
} // namespace intel
66+
} // namespace sycl
67+
} // __SYCL_INLINE_NAMESPACE(cl)

sycl/test/abi/sycl_symbols_linux.dump

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3621,6 +3621,8 @@ _ZN2cl4sycl5eventC1Ev
36213621
_ZN2cl4sycl5eventC2EP9_cl_eventRKNS0_7contextE
36223622
_ZN2cl4sycl5eventC2ESt10shared_ptrINS0_6detail10event_implEE
36233623
_ZN2cl4sycl5eventC2Ev
3624+
_ZN2cl4sycl5intel6detail17reduComputeWGSizeEmmRm
3625+
_ZN2cl4sycl5intel6detail16reduGetMaxWGSizeESt10shared_ptrINS0_6detail10queue_implEEm
36243626
_ZN2cl4sycl5queue10mem_adviseEPKvm14_pi_mem_advice
36253627
_ZN2cl4sycl5queue10wait_proxyERKNS0_6detail13code_locationE
36263628
_ZN2cl4sycl5queue11submit_implESt8functionIFvRNS0_7handlerEEERKNS0_6detail13code_locationE

sycl/test/reduction/reduction_nd_s1_rw.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ int main() {
8787

8888
// Check with various types.
8989
test<float, 1, std::multiplies<float>>(1, 8, 256);
90-
test<float, 1, intel::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
90+
test<float, 1, intel::minimum<float>>(getMaximumFPValue<float>(), 1, 16);
9191
test<float, 1, intel::maximum<float>>(getMinimumFPValue<float>(), 8, 256);
9292

9393
// Check with CUSTOM type.

sycl/test/reduction/reduction_transparent.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ int main() {
113113
test<float, 0, intel::maximum<>>(getMinimumFPValue<float>(), 7, 7 * 5);
114114
test<signed char, 0, intel::plus<>>(0, 7, 49);
115115
test<unsigned char, 1, std::multiplies<>>(1, 4, 16);
116+
test<unsigned short, 0, intel::plus<>>(0, 1, 512 + 32);
116117
#endif // __cplusplus >= 201402L
117118

118119
std::cout << "Test passed\n";

0 commit comments

Comments
 (0)