Skip to content

Commit 2b6f2cd

Browse files
authored
[SYCL][CUDA] Add missing barrier to collectives (#2990)
SYCL sub-group and group functions should act as synchronization points. Group collectives need a barrier at the end to ensure that back-to-back collectives do not lead to a race condition. Note that the barrier at the beginning of each collective occurs after each work-item writes its partial results to the scratch space. This is assumed safe because only the collective functions can access the space, and collective functions must be encountered in uniform control flow; any work-item encountering a collective function can assume it is safe to use the scratch space, because all work-items in the same work-group must have either executed no collective functions or the barrier at the end of a previous collective function. Signed-off-by: John Pennycook [email protected]
1 parent aeb4de7 commit 2b6f2cd

File tree

2 files changed

+71
-0
lines changed

2 files changed

+71
-0
lines changed

libclc/ptx-nvidiacl/libspirv/group/collectives.cl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,7 @@ __CLC_SUBGROUP_COLLECTIVE(FMax, __CLC_MAX, double, -DBL_MAX)
260260
result = OP(sg_x, sg_prefix); \
261261
} \
262262
} \
263+
__spirv_ControlBarrier(Workgroup, 0, 0); \
263264
return result; \
264265
}
265266

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %RUN_ON_HOST %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out

// Regression test for back-to-back group collectives: without a trailing
// barrier in each collective, consecutive calls can race on the shared
// scratch space and produce wrong reductions/scans.
#include <CL/sycl.hpp>
#include <iostream> // used directly via std::cout below
#include <numeric>
#include <vector>
using namespace cl::sycl;
using namespace cl::sycl::ONEAPI;

class back_to_back;

int main() {
  queue q;
  // Group collectives are not meaningful on the host device; skip there.
  if (q.get_device().is_host()) {
    std::cout << "Skipping test\n";
    return 0;
  }

  // Use max work-group size to maximize chance of race
  program prog(q.get_context());
  prog.build_with_kernel_type<back_to_back>();
  kernel k = prog.get_kernel<back_to_back>();
  device d = q.get_device();
  int N = k.get_info<info::kernel_device_specific::work_group_size>(d);

  // Input is 0..N-1 so expected reduction/scan values are closed-form.
  std::vector<int> Input(N), Sum(N), EScan(N), IScan(N);
  std::iota(Input.begin(), Input.end(), 0);
  std::fill(Sum.begin(), Sum.end(), 0);
  std::fill(EScan.begin(), EScan.end(), 0);
  std::fill(IScan.begin(), IScan.end(), 0);

  {
    buffer<int> InputBuf(Input.data(), N);
    buffer<int> SumBuf(Sum.data(), N);
    buffer<int> EScanBuf(EScan.data(), N);
    buffer<int> IScanBuf(IScan.data(), N);
    q.submit([&](handler &h) {
      auto Input = InputBuf.get_access<access::mode::read>(h);
      auto Sum = SumBuf.get_access<access::mode::write>(h);
      auto EScan = EScanBuf.get_access<access::mode::write>(h);
      auto IScan = IScanBuf.get_access<access::mode::write>(h);
      h.parallel_for<back_to_back>(nd_range<1>(N, N), [=](nd_item<1> it) {
        size_t i = it.get_global_id(0);
        auto g = it.get_group();
        // Loop to increase number of back-to-back calls
        for (int r = 0; r < 10; ++r) {
          Sum[i] = reduce(g, Input[i], plus<>());
          EScan[i] = exclusive_scan(g, Input[i], plus<>());
          IScan[i] = inclusive_scan(g, Input[i], plus<>());
        }
      });
    });
  } // buffers destruct here, copying results back to the host vectors

  // Verify against a serial prefix sum: EScan[i] excludes element i,
  // IScan[i] includes it, and every Sum[i] equals the full reduction.
  int sum = 0;
  bool passed = true;
  for (int i = 0; i < N; ++i) {
    passed &= (sum == EScan[i]);
    sum += i;
    passed &= (sum == IScan[i]);
  }
  for (int i = 0; i < N; ++i) {
    passed &= (sum == Sum[i]);
  }
  // Report the actual verification result; previously the test printed
  // "Test passed." and returned 0 unconditionally, so the `passed` flag was
  // never checked and a real race could go undetected by the lit harness.
  if (passed) {
    std::cout << "Test passed." << std::endl;
    return 0;
  }
  std::cout << "Test FAILED." << std::endl;
  return 1;
}

0 commit comments

Comments
 (0)