@@ -1033,8 +1033,6 @@ class __SYCL_EXPORT handler {
             int Dims, typename Reduction>
   detail::enable_if_t<!Reduction::has_fast_atomics>
   parallel_for(nd_range<Dims> Range, Reduction Redu, KernelType KernelFunc) {
-    size_t NWorkGroups = Range.get_group_range().size();
-
     // This parallel_for() is lowered to the following sequence:
     // 1) Call a kernel that a) calls user's lambda function and b) performs
     //    one iteration of reduction, storing the partial reductions/sums
@@ -1048,24 +1046,36 @@ class __SYCL_EXPORT handler {
     // 2) Call an aux kernel (if necessary, i.e. if N2 > 1) as many times as
     //    necessary to reduce all partial sums into one final sum.
 
+    // Before running the kernels, check that the device has enough local
+    // memory to hold the local arrays that may be required for the reduction
+    // algorithm.
+    // TODO: If the work-group size is limited by the local memory, then
+    // a special version of the main kernel may be created, one that would
+    // not use local accessors, which means it would not do the reduction in
+    // the main kernel, but simply generate Range.get_global_range().size()
+    // partial sums, leaving the reduction work to the additional/aux kernels.
+    constexpr bool HFR = Reduction::has_fast_reduce;
+    size_t OneElemSize = HFR ? 0 : sizeof(typename Reduction::result_type);
+    // TODO: currently the maximal work group size is determined for the given
+    // queue/device, while it may be safer to use queries to the kernel
+    // compiled for the device.
+    size_t MaxWGSize = intel::detail::reduGetMaxWGSize(MQueue, OneElemSize);
+    assert(MaxWGSize >= Range.get_local_range().size() &&
+           "This reduction implementation requires more device resources.");
+
     // 1. Call the kernel that includes user's lambda function.
     intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
     shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
     this->finalize();
 
-    // 2. Find the maximal work group size usable for the additional kernel(s).
-    size_t MaxWGSize;
-    if (NWorkGroups > 1) {
-      constexpr bool HFR = Reduction::has_fast_reduce;
-      size_t OneElemSize = HFR ? 0 : sizeof(typename Reduction::result_type);
-      MaxWGSize = intel::detail::reduGetMaxWGSize(QueueCopy, OneElemSize);
-      assert(MaxWGSize > 1 &&
-             "Work group size must be greater than 1 to avoid endless loop.");
-    }
-
-    // 3. Run the additional kernel as many times as needed to reduce
+    // 2. Run the additional kernel as many times as needed to reduce
     //    all partial sums into one scalar.
-    size_t NWorkItems = NWorkGroups;
+
+    // TODO: Create a special slow/sequential version of the kernel that would
+    // handle the reduction instead of reporting an assert below.
+    assert(MaxWGSize > 1 &&
+           "Work group size must be greater than 1 to avoid endless loop.");
+    size_t NWorkItems = Range.get_group_range().size();
     while (NWorkItems > 1) {
       handler AuxHandler(QueueCopy, MIsHost);
       AuxHandler.saveCodeLoc(MCodeLoc);
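
For context, here is a minimal, self-contained sketch of how user code reaches this overload. It is an editorial illustration, not part of the patch: the `intel::reduction`/`intel::plus` spellings reflect the extension API as it looked around this change (later renamed to `ONEAPI::`/`sycl::`), and whether a given type/operation takes this tree-reduction path or the fast-atomics path depends on `Reduction::has_fast_atomics` for the target device.

```cpp
#include <CL/sycl.hpp>
#include <vector>
using namespace cl::sycl;

int main() {
  constexpr size_t N = 1024 * 1024; // global size
  constexpr size_t WGSize = 256;    // work-group size requested by the user
  std::vector<double> Data(N, 1.0);
  double Sum = 0.0;
  {
    queue Q;
    buffer<double, 1> InBuf(Data.data(), range<1>{N});
    buffer<double, 1> SumBuf(&Sum, range<1>{1});
    Q.submit([&](handler &CGH) {
      auto In = InBuf.get_access<access::mode::read>(CGH);
      auto SumAcc = SumBuf.get_access<access::mode::discard_write>(CGH);
      // plus<double> typically has no fast atomics, so the runtime lowers this
      // to the main kernel + aux kernel sequence shown in the diff above.
      CGH.parallel_for<class SumKernel>(
          nd_range<1>{range<1>{N}, range<1>{WGSize}},
          intel::reduction(SumAcc, intel::plus<double>()),
          [=](nd_item<1> It, auto &SumRed) {
            SumRed += In[It.get_global_id(0)];
          });
    });
  } // buffers go out of scope; Sum is written back on the host
  return Sum == static_cast<double>(N) ? 0 : 1;
}
```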
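The relocated `assert(MaxWGSize > 1 ...)` guards the `while (NWorkItems > 1)` loop: each auxiliary pass shrinks the number of elements by roughly one work-group's worth, so a work-group size of 1 would never make progress. Below is a small host-side sketch of that arithmetic; the fixed 256 work-group size and the round-up division are illustrative assumptions, while the runtime derives the per-pass size from `reduGetMaxWGSize`.

```cpp
#include <cstddef>
#include <iostream>

int main() {
  const std::size_t GlobalSize = 1 << 20; // work-items in the main kernel
  const std::size_t WGSize = 256;         // work-group size used for each pass

  // The main kernel leaves one partial sum per work-group.
  std::size_t NWorkItems = GlobalSize / WGSize; // 4096 partial sums

  int AuxPasses = 0;
  while (NWorkItems > 1) {
    // Each aux pass reduces every group's inputs to one value, so the count
    // shrinks by a factor of WGSize: 4096 -> 16 -> 1. With WGSize == 1 the
    // count would never shrink, hence the assert in the patch.
    NWorkItems = (NWorkItems + WGSize - 1) / WGSize;
    ++AuxPasses;
  }
  std::cout << "aux kernel passes: " << AuxPasses << "\n"; // prints 2
  return 0;
}
```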