This repository was archived by the owner on Mar 28, 2023. It is now read-only.

[SYCL][CUDA] Add tests for asynchronous barrier #737

Merged
merged 7 commits on May 17, 2022
157 changes: 157 additions & 0 deletions SYCL/GroupAlgorithm/barrier.cpp
@@ -0,0 +1,157 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_80
// RUN: %GPU_RUN_PLACEHOLDER %t.out

// REQUIRES: cuda
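// Tests for the CUDA asynchronous barrier
// (sycl::ext::oneapi::experimental::cuda::barrier): basic() checks the core
// arrive/wait pattern, interface() touches the rest of the API surface.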

#include "CL/sycl.hpp"
#include <iostream>
#include <vector>

using namespace sycl;
using namespace sycl::ext::oneapi::experimental::cuda;

void basic() {
  queue q{};
  int N = 64;
  std::vector<int> data(N);
  for (int i = 0; i < N; i++) {
    data[i] = i;
  }
  {
    buffer<int> buf(data.data(), N);

    q.submit([&](handler &cgh) {
      auto acc = buf.get_access<access::mode::read_write>(cgh);
      accessor<int, 1, access::mode::read_write, access::target::local> loc(
          N, cgh);
      accessor<barrier, 1, access::mode::read_write, access::target::local>
          loc_barrier(2, cgh);
      cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> item) {
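        // Each work-item stages its element in local memory; work-items 0 and
        // 1 initialize the two shared barriers with an expected arrival count
        // of N (one arrival per work-item in the group).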
        size_t idx = item.get_local_linear_id();
        loc[idx] = acc[idx];
        if (idx < 2) {
          loc_barrier[idx].initialize(N);
        }
        item.barrier(access::fence_space::local_space);
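        // Rotate the values around the group N times. Barrier 0 orders the
        // local-memory reads against the neighbouring writes within each
        // iteration; barrier 1 separates consecutive iterations.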
        for (int i = 0; i < N; i++) {
          int val = loc[idx];
          barrier::arrival_token arr = loc_barrier[0].arrive();
          val += 1;
          int dst_idx = (idx + 1) % N;
          loc_barrier[0].wait(arr);
          loc[dst_idx] = val;
          loc_barrier[1].wait(loc_barrier[1].arrive());
        }
        acc[idx] = loc[idx];
      });
    });
  }
  for (int i = 0; i < N; i++) {
    assert(data[i] == i + N);
  }
}

void interface() {
  queue q{};
  int N = 64;
  std::vector<int> data(N, -1);
  std::vector<int> test1(N, -1);
  std::vector<int> test2(N, -1);
  for (int i = 0; i < N; i++) {
    data[i] = i;
  }
  {
    buffer<int> data_buf(data.data(), N);
    buffer<int> test1_buf(test1.data(), N);
    buffer<int> test2_buf(test2.data(), N);

    q.submit([&](handler &cgh) {
      auto data_acc = data_buf.get_access<access::mode::read_write>(cgh);
      auto test1_acc = test1_buf.get_access<access::mode::read_write>(cgh);
      auto test2_acc = test2_buf.get_access<access::mode::read_write>(cgh);
      accessor<int, 1, access::mode::read_write, access::target::local> loc(
          N, cgh);
      accessor<barrier, 1, access::mode::read_write, access::target::local>
          loc_barrier(2, cgh);
      cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> item) {
        size_t idx = item.get_local_linear_id();
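        // Work-item 0 initializes barrier 0 with N expected arrivals;
        // work-item 1 initializes barrier 1 with N * N expected arrivals.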
        if (idx == 0) {
          loc_barrier[0].initialize(N);
        }
        if (idx == 1) {
          loc_barrier[1].initialize(N * N);
        }
        item.barrier(access::fence_space::local_space);

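        // Stage the input in local memory with an async work-group copy and
        // synchronize it through barrier 1, exercising arrive_copy_async,
        // arrive_no_complete and arrive_and_wait.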
        item.async_work_group_copy(loc.get_pointer(), data_acc.get_pointer(),
                                   N);
        loc_barrier[1].arrive_copy_async();
        barrier::arrival_token arr = loc_barrier[1].arrive_no_complete(N - 1);
        loc_barrier[1].arrive_and_wait();

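        // Invalidate barrier 0 so its storage can be reused as ordinary local
        // memory, shuttle a value through that space, then re-initialize the
        // barrier for the rotation below.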
        if (idx == 0) {
          loc_barrier[0].invalidate();
          int *reused_barrier_space = (int *)(void *)loc_barrier.get_pointer();
          *reused_barrier_space = loc[0];
          loc[0] = 0;
        }
        item.barrier(access::fence_space::local_space);
        if (idx == 1) {
          int *reused_barrier_space = (int *)(void *)loc_barrier.get_pointer();
          loc[0] = *reused_barrier_space;
        }
        item.barrier(access::fence_space::local_space);
        if (idx == 0) {
          loc_barrier[0].initialize(N);
        }

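        // A single pass of the ring rotation from basic(); because the new
        // value is taken modulo N, data_buf ends up unchanged.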
        int val = loc[idx];
        arr = loc_barrier[0].arrive();
        val = (val + 1) % N;
        int dst_idx = (idx + 1) % N;
        loc_barrier[0].wait(arr);
        loc[dst_idx] = val;
        loc_barrier[0].wait(loc_barrier[0].arrive());

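        // Copy the result back asynchronously and check test_wait(): it is
        // expected to report false before the phase completes (test1) and
        // true once it has (test2).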
        item.async_work_group_copy(data_acc.get_pointer(), loc.get_pointer(),
                                   N);
        loc_barrier[1].arrive_copy_async_no_inc();
        loc_barrier[1].arrive_no_complete(N - 3);
        arr = loc_barrier[1].arrive();
        test1_acc[idx] = loc_barrier[1].test_wait(arr);
        arr = loc_barrier[1].arrive();
        item.barrier(access::fence_space::local_space);
        test2_acc[idx] = loc_barrier[1].test_wait(arr);
        loc_barrier[1].wait(arr);

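        // Exercise the arrive_and_drop variants, which also lower the
        // expected arrival count for later phases; the final loop then
        // completes the last phase with N - 6 arrivals per work-item.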
        loc_barrier[1].arrive_no_complete(N - 6);
        loc_barrier[1].arrive_and_drop_no_complete(5);
        arr = loc_barrier[1].arrive_and_drop();
        loc_barrier[1].wait(arr);

        for (int i = 0; i < N - 6; i++) {
          arr = loc_barrier[1].arrive();
        }
        loc_barrier[1].wait(arr);
      });
    });
  }
  for (int i = 0; i < N; i++) {
    assert(data[i] == i);
    assert(test1[i] == 0);
    assert(test2[i] == 1);
  }
}

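// The test is compiled for sm_80; skip at run time if the device does not
// report the ext_oneapi_cuda_async_barrier aspect.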
int main() {
  queue q;
  if (!q.get_device().has(aspect::ext_oneapi_cuda_async_barrier)) {
    std::cout << "Barrier is not supported by the device. Skipping test."
              << std::endl;
    return 0;
  }
  basic();
  interface();

  return 0;
}