Skip to content

Commit dc21cce

Browse files
t4c1bb-sycl
authored andcommitted
[SYCL][CUDA] Add tests for asynchronous barrier (intel#737)
Adds tests for intel/llvm#5303
1 parent 30e08cf commit dc21cce

File tree

1 file changed

+157
-0
lines changed

1 file changed

+157
-0
lines changed

SYCL/GroupAlgorithm/barrier.cpp

Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_80
2+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
3+
4+
// REQUIRES: cuda
5+
6+
#include "CL/sycl.hpp"
7+
#include <iostream>
8+
#include <vector>
9+
10+
using namespace sycl;
11+
using namespace sycl::ext::oneapi::experimental::cuda;
12+
13+
void basic() {
14+
queue q{};
15+
int N = 64;
16+
std::vector<int> data(N);
17+
for (int i = 0; i < N; i++) {
18+
data[i] = i;
19+
}
20+
{
21+
buffer<int> buf(data.data(), N);
22+
23+
q.submit([&](handler &cgh) {
24+
auto acc = buf.get_access<access::mode::read_write>(cgh);
25+
accessor<int, 1, access::mode::read_write, access::target::local> loc(
26+
N, cgh);
27+
accessor<barrier, 1, access::mode::read_write, access::target::local>
28+
loc_barrier(2, cgh);
29+
cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> item) {
30+
size_t idx = item.get_local_linear_id();
31+
loc[idx] = acc[idx];
32+
if (idx < 2) {
33+
loc_barrier[idx].initialize(N);
34+
}
35+
item.barrier(access::fence_space::local_space);
36+
for (int i = 0; i < N; i++) {
37+
int val = loc[idx];
38+
barrier::arrival_token arr = loc_barrier[0].arrive();
39+
val += 1;
40+
int dst_idx = (idx + 1) % N;
41+
loc_barrier[0].wait(arr);
42+
loc[dst_idx] = val;
43+
loc_barrier[1].wait(loc_barrier[1].arrive());
44+
}
45+
acc[idx] = loc[idx];
46+
});
47+
});
48+
}
49+
for (int i = 0; i < N; i++) {
50+
assert(data[i] == i + N);
51+
}
52+
}
53+
54+
void interface() {
55+
queue q{};
56+
int N = 64;
57+
std::vector<int> data(N, -1);
58+
std::vector<int> test1(N, -1);
59+
std::vector<int> test2(N, -1);
60+
for (int i = 0; i < N; i++) {
61+
data[i] = i;
62+
}
63+
{
64+
buffer<int> data_buf(data.data(), N);
65+
buffer<int> test1_buf(test1.data(), N);
66+
buffer<int> test2_buf(test2.data(), N);
67+
68+
q.submit([&](handler &cgh) {
69+
auto data_acc = data_buf.get_access<access::mode::read_write>(cgh);
70+
auto test1_acc = test1_buf.get_access<access::mode::read_write>(cgh);
71+
auto test2_acc = test2_buf.get_access<access::mode::read_write>(cgh);
72+
accessor<int, 1, access::mode::read_write, access::target::local> loc(
73+
N, cgh);
74+
accessor<barrier, 1, access::mode::read_write, access::target::local>
75+
loc_barrier(2, cgh);
76+
cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> item) {
77+
size_t idx = item.get_local_linear_id();
78+
if (idx == 0) {
79+
loc_barrier[0].initialize(N);
80+
}
81+
if (idx == 1) {
82+
loc_barrier[1].initialize(N * N);
83+
}
84+
item.barrier(access::fence_space::local_space);
85+
86+
item.async_work_group_copy(loc.get_pointer(), data_acc.get_pointer(),
87+
N);
88+
loc_barrier[1].arrive_copy_async();
89+
barrier::arrival_token arr = loc_barrier[1].arrive_no_complete(N - 1);
90+
loc_barrier[1].arrive_and_wait();
91+
92+
if (idx == 0) {
93+
loc_barrier[0].invalidate();
94+
int *reused_barrier_space = (int *)(void *)loc_barrier.get_pointer();
95+
*reused_barrier_space = loc[0];
96+
loc[0] = 0;
97+
}
98+
item.barrier(access::fence_space::local_space);
99+
if (idx == 1) {
100+
int *reused_barrier_space = (int *)(void *)loc_barrier.get_pointer();
101+
loc[0] = *reused_barrier_space;
102+
}
103+
item.barrier(access::fence_space::local_space);
104+
if (idx == 0) {
105+
loc_barrier[0].initialize(N);
106+
}
107+
108+
int val = loc[idx];
109+
arr = loc_barrier[0].arrive();
110+
val = (val + 1) % N;
111+
int dst_idx = (idx + 1) % N;
112+
loc_barrier[0].wait(arr);
113+
loc[dst_idx] = val;
114+
loc_barrier[0].wait(loc_barrier[0].arrive());
115+
116+
item.async_work_group_copy(data_acc.get_pointer(), loc.get_pointer(),
117+
N);
118+
loc_barrier[1].arrive_copy_async_no_inc();
119+
loc_barrier[1].arrive_no_complete(N - 3);
120+
arr = loc_barrier[1].arrive();
121+
test1_acc[idx] = loc_barrier[1].test_wait(arr);
122+
arr = loc_barrier[1].arrive();
123+
item.barrier(access::fence_space::local_space);
124+
test2_acc[idx] = loc_barrier[1].test_wait(arr);
125+
loc_barrier[1].wait(arr);
126+
127+
loc_barrier[1].arrive_no_complete(N - 6);
128+
loc_barrier[1].arrive_and_drop_no_complete(5);
129+
arr = loc_barrier[1].arrive_and_drop();
130+
loc_barrier[1].wait(arr);
131+
132+
for (int i = 0; i < N - 6; i++) {
133+
arr = loc_barrier[1].arrive();
134+
}
135+
loc_barrier[1].wait(arr);
136+
});
137+
});
138+
}
139+
for (int i = 0; i < N; i++) {
140+
assert(data[i] == i);
141+
assert(test1[i] == 0);
142+
assert(test2[i] == 1);
143+
}
144+
}
145+
146+
int main() {
147+
queue q;
148+
if (!q.get_device().has(aspect::ext_oneapi_cuda_async_barrier)) {
149+
std::cout << "Barrier is not supported by the device. Skipping test."
150+
<< std::endl;
151+
return 0;
152+
}
153+
basic();
154+
interface();
155+
156+
return 0;
157+
}

0 commit comments

Comments
 (0)