
Commit f9137f4

Check the max work group size for the main kernel, not only for additional/aux kernel
Signed-off-by: Vyacheslav N Klochkov <[email protected]>
Parent: 82a8b8b · Commit: f9137f4


sycl/include/CL/sycl/handler.hpp

Lines changed: 24 additions & 14 deletions
@@ -1033,8 +1033,6 @@ class __SYCL_EXPORT handler {
             int Dims, typename Reduction>
   detail::enable_if_t<!Reduction::has_fast_atomics>
   parallel_for(nd_range<Dims> Range, Reduction Redu, KernelType KernelFunc) {
-    size_t NWorkGroups = Range.get_group_range().size();
-
     // This parallel_for() is lowered to the following sequence:
     // 1) Call a kernel that a) call user's lambda function and b) performs
     //    one iteration of reduction, storing the partial reductions/sums
@@ -1048,24 +1046,36 @@
     // 2) Call an aux kernel (if necessary, i.e. if N2 > 1) as many times as
     //    necessary to reduce all partial sums into one final sum.

+    // Before running the kernels, check that device has enough local memory
+    // to hold local arrays that may be required for the reduction algorithm.
+    // TODO: If the work-group-size is limited by the local memory, then
+    // a special version of the main kernel may be created. The one that would
+    // not use local accessors, which means it would not do the reduction in
+    // the main kernel, but simply generate Range.get_global_range.size() number
+    // of partial sums, leaving the reduction work to the additional/aux
+    // kernels.
+    constexpr bool HFR = Reduction::has_fast_reduce;
+    size_t OneElemSize = HFR ? 0 : sizeof(typename Reduction::result_type);
+    // TODO: currently the maximal work group size is determined for the given
+    // queue/device, while it may be safer to use queries to the kernel compiled
+    // for the device.
+    size_t MaxWGSize = intel::detail::reduGetMaxWGSize(QueueCopy, OneElemSize);
+    assert(MaxWGSize >= Range.get_local_range().size() &&
+           "This reduction implementation requires more device resources.");
+
     // 1. Call the kernel that includes user's lambda function.
     intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
     shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
     this->finalize();

-    // 2. Find the maximal work group size usable for the additional kernel(s).
-    size_t MaxWGSize;
-    if (NWorkGroups > 1) {
-      constexpr bool HFR = Reduction::has_fast_reduce;
-      size_t OneElemSize = HFR ? 0 : sizeof(typename Reduction::result_type);
-      MaxWGSize = intel::detail::reduGetMaxWGSize(QueueCopy, OneElemSize);
-      assert(MaxWGSize > 1 &&
-             "Work group size must be greater than 1 to avoid endless loop.");
-    }
-
-    // 3. Run the additional kernel as many times as needed to reduce
+    // 2. Run the additional kernel as many times as needed to reduce
     //    all partial sums into one scalar.
-    size_t NWorkItems = NWorkGroups;
+
+    // TODO: Create a special slow/sequential version of the kernel that would
+    //       handle the reduction instead of reporting an assert below.
+    assert(MaxWGSize > 1 &&
+           "Work group size must be greater than 1 to avoid endless loop.");
+    size_t NWorkItems = Range.get_group_range().size();
     while (NWorkItems > 1) {
       handler AuxHandler(QueueCopy, MIsHost);
       AuxHandler.saveCodeLoc(MCodeLoc);
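
The new check derives MaxWGSize from the device before any kernel is submitted, where previously the limit was only queried for the aux kernels and only when more than one work-group existed. Below is a minimal sketch of the kind of bound such a query is assumed to compute; estimateMaxWGSize and its parameters are hypothetical and do not describe the actual intel::detail::reduGetMaxWGSize implementation.

#include <algorithm>
#include <cstddef>

// Hypothetical helper: the usable work-group size is bounded by the device's
// maximum work-group size and, when a local array of partial sums is needed
// (i.e. no fast-reduce path), by how many result_type elements fit in local
// memory, since each work-item keeps one partial sum there.
std::size_t estimateMaxWGSize(std::size_t DeviceMaxWGSize,
                              std::size_t LocalMemBytes,
                              std::size_t OneElemSize) {
  if (OneElemSize == 0)
    return DeviceMaxWGSize; // fast-reduce path: no local array required
  return std::min(DeviceMaxWGSize, LocalMemBytes / OneElemSize);
}

Under these assumptions, estimateMaxWGSize(256, 1024, sizeof(double)) yields 128, so an nd_range with a local size of 256 would trip the new assert ("This reduction implementation requires more device resources.") on such a device.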
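
The second assert guards the aux-kernel loop that follows: each pass shrinks the number of remaining partial sums by roughly a factor of the work-group size, so a limit of 1 would never make progress. The following self-contained sketch models that pass-count arithmetic only, under assumed values for MaxWGSize and the initial number of partial sums; it is not the SYCL runtime code.

#include <cassert>
#include <cstddef>
#include <iostream>

int main() {
  const std::size_t MaxWGSize = 256; // assumed device/kernel limit
  std::size_t NWorkItems = 10000;    // assumed partial sums left by the main kernel

  // Mirrors the assert in the patch: with a work-group size of 1, each pass
  // would emit as many partial sums as it consumed and never terminate.
  assert(MaxWGSize > 1 &&
         "Work group size must be greater than 1 to avoid endless loop.");

  int Passes = 0;
  while (NWorkItems > 1) {
    // Each aux pass uses work-groups no larger than MaxWGSize and produces
    // one partial sum per work-group.
    std::size_t WGSize = NWorkItems < MaxWGSize ? NWorkItems : MaxWGSize;
    NWorkItems = (NWorkItems + WGSize - 1) / WGSize;
    ++Passes;
  }
  std::cout << "aux-kernel passes: " << Passes << "\n"; // prints 2 for these values
  return 0;
}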

0 commit comments
