Skip to content

Commit 2aef0c9

Browse files
[SYCL][Reduction] Fix last group detection's atomic memory_order (#8058)
It needs to be at least `memory_order::acq_rel` because we need updates to the partial sums to be visible to the last finished thread. This was caught on an existing test from the intel/llvm-test-suite that we've run internally on a new configuration, hence no complementary test suite change.
1 parent 3d8a136 commit 2aef0c9

File tree

1 file changed

+7
-2
lines changed

1 file changed

+7
-2
lines changed

sycl/include/sycl/reduction.hpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -989,7 +989,7 @@ struct NDRangeReduction<
989989
// Signal this work-group has finished after all values are reduced
990990
if (LID == 0) {
991991
auto NFinished =
992-
sycl::atomic_ref<int, memory_order::relaxed, memory_scope::device,
992+
sycl::atomic_ref<int, memory_order::acq_rel, memory_scope::device,
993993
access::address_space::global_space>(
994994
NWorkGroupsFinished[0]);
995995
DoReducePartialSumsInLastWG[0] = ++NFinished == NWorkGroups;
@@ -1107,7 +1107,7 @@ template <> struct NDRangeReduction<reduction::strategy::range_basic> {
11071107
// Signal this work-group has finished after all values are reduced
11081108
if (LID == 0) {
11091109
auto NFinished =
1110-
sycl::atomic_ref<int, memory_order::relaxed, memory_scope::device,
1110+
sycl::atomic_ref<int, memory_order::acq_rel, memory_scope::device,
11111111
access::address_space::global_space>(
11121112
NWorkGroupsFinished[0]);
11131113
DoReducePartialSumsInLastWG[0] =
@@ -2321,6 +2321,11 @@ void reduction_parallel_for(handler &CGH, range<Dims> Range,
23212321
if constexpr (Strategy != reduction::strategy::auto_select)
23222322
return Strategy;
23232323

2324+
// TODO: Both group_reduce_and_last_wg_detection and range_basic require
2325+
// memory_order::acq_rel support that isn't guaranteed by the
2326+
// specification. However, implementing run-time check for that would
2327+
// result in an extra kernel compilation(s). We probably need to
2328+
// investigate if the usage of kernel_bundles can mitigate that.
23242329
if constexpr (Reduction::has_fast_reduce)
23252330
return reduction::strategy::group_reduce_and_last_wg_detection;
23262331
else if constexpr (Reduction::has_fast_atomics)

0 commit comments

Comments
 (0)