Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit 32eac9a

Browse files
committed
Merge branch 'intel' into bfloat16-class-tests
2 parents 0ede881 + 83bbe77 commit 32eac9a

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

48 files changed

+1856
-210
lines changed

SYCL/Assert/assert_in_simultaneous_kernels.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,11 @@
1616
// RUN: %ACC_RUN_PLACEHOLDER %t.out &> %t.txt
1717
// RUN: %ACC_RUN_PLACEHOLDER FileCheck %s --check-prefix=CHECK-ACC --input-file %t.txt
1818
//
19-
// CHECK: {{.*}}assert_in_simultaneous_kernels.hpp:12: void assertFunc(): global id: [9,7,0], local id: [0,0,0]
19+
// CHECK: {{.*}}assert_in_simultaneous_kernels.hpp:13: void assertFunc(): global id: [9,7,0], local id: [0,0,0]
2020
// CHECK-SAME: Assertion `false && "from assert statement"` failed.
2121
// CHECK-NOT: The test ended.
2222
//
23-
// CHECK-ACC-NOT: {{.*}}assert_in_simultaneous_kernels.hpp:12: void assertFunc(): global id: [9,7,0], local id: [0,0,0]
23+
// CHECK-ACC-NOT: {{.*}}assert_in_simultaneous_kernels.hpp:13: void assertFunc(): global id: [9,7,0], local id: [0,0,0]
2424
// CHECK-ACC: The test ended.
2525

2626
#include "assert_in_simultaneous_kernels.hpp"

SYCL/Assert/assert_in_simultaneous_kernels.hpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#include <CL/sycl.hpp>
22
#include <cassert>
3+
#include <cstdio>
34
#include <iostream>
45
#include <thread>
56

@@ -44,6 +45,15 @@ void runTestForTid(queue *Q, size_t Tid) {
4445
}
4546

4647
int main(int Argc, const char *Argv[]) {
48+
// On windows stderr output becomes messed up if several thread
49+
// output simultaneously. Hence, setting explicit line buffering here.
50+
#ifndef __SYCL_DEVICE_ONLY__
51+
if (setvbuf(stderr, nullptr, _IOLBF, BUFSIZ)) {
52+
std::cerr << "Can't set line-buffering mode fo stderr\n";
53+
return 1;
54+
}
55+
#endif
56+
4757
std::vector<std::thread> threadPool;
4858
threadPool.reserve(NUM_THREADS);
4959

SYCL/Assert/assert_in_simultaneous_kernels_win.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,11 @@
1717
//
1818
// FIXME Windows version prints '(null)' instead of '<unknown func>' once in a
1919
// while for some insane reason.
20-
// CHECK: {{.*}}assert_in_simultaneous_kernels.hpp:12: {{<unknown func>|(null)}}: global id: [9,7,0], local id: [0,0,0]
20+
// CHECK: {{.*}}assert_in_simultaneous_kernels.hpp:13: {{<unknown func>|(null)}}: global id: [9,7,0], local id: [0,0,0]
2121
// CHECK-SAME: Assertion `false && "from assert statement"` failed.
2222
// CHECK-NOT: The test ended.
2323
//
24-
// CHECK-ACC-NOT: {{.*}}assert_in_simultaneous_kernels.hpp:12: {{<unknown func>|(null)}}: global id: [9,7,0], local id: [0,0,0]
24+
// CHECK-ACC-NOT: {{.*}}assert_in_simultaneous_kernels.hpp:13: {{<unknown func>|(null)}}: global id: [9,7,0], local id: [0,0,0]
2525
// CHECK-ACC: The test ended.
2626

2727
#include "assert_in_simultaneous_kernels.hpp"

SYCL/Assert/assert_in_simultaneously_multiple_tus.cpp

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,6 @@
11
// FIXME unsupported on CUDA and HIP until fallback libdevice becomes available
2-
// UNSUPPORTED: cuda || hip
3-
// clang-format off
4-
// Failed on Linux on unrelated change (FileCheck error: '.../assert_in_simultaneously_multiple_tus.cpp.tmp.txt' is empty)
5-
// clang-format on
6-
// REQUIRES: TEMPORARILY_DISABLED
2+
// FIXME flaky output on Level Zero
3+
// UNSUPPORTED: cuda || hip || level_zero
74
// RUN: %clangxx -DSYCL_FALLBACK_ASSERT=1 -fsycl -fsycl-targets=%sycl_triple -I %S/Inputs %s %S/Inputs/kernels_in_file2.cpp -o %t.out %threads_lib
85
// RUN: %CPU_RUN_PLACEHOLDER %t.out &> %t.txt || true
96
// RUN: %CPU_RUN_PLACEHOLDER FileCheck %s --input-file %t.txt
@@ -27,6 +24,7 @@
2724

2825
#include "Inputs/kernels_in_file2.hpp"
2926
#include <CL/sycl.hpp>
27+
#include <cstdio>
3028
#include <iostream>
3129
#include <thread>
3230

@@ -85,6 +83,15 @@ void runTestForTid(queue *Q, size_t Tid) {
8583
}
8684

8785
int main(int Argc, const char *Argv[]) {
86+
#ifndef __SYCL_DEVICE_ONLY__
87+
// On windows stderr output becomes messed up if several thread
88+
// output simultaneously. Hence, setting explicit line buffering here.
89+
if (setvbuf(stderr, nullptr, _IOLBF, BUFSIZ)) {
90+
std::cerr << "Can't set line-buffering mode fo stderr\n";
91+
return 1;
92+
}
93+
#endif
94+
8895
std::vector<std::thread> threadPool;
8996
threadPool.reserve(NUM_THREADS);
9097

Lines changed: 218 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,239 @@
1-
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
1+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -O3 -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70
22
// RUN: %HOST_RUN_PLACEHOLDER %t.out
33
// RUN: %CPU_RUN_PLACEHOLDER %t.out
44
// RUN: %GPU_RUN_PLACEHOLDER %t.out
55
// RUN: %ACC_RUN_PLACEHOLDER %t.out
66
// L0, OpenCL, and HIP backends don't currently support
77
// info::device::atomic_memory_order_capabilities
8-
// UNSUPPORTED: level_zero || opencl || hip
8+
// UNSUPPORTED: level_zero, opencl, hip
99

10-
// NOTE: Tests load and store for acquire-release memory ordering.
10+
// host does not support barrier
11+
// XFAIL: host
12+
13+
// NOTE: Tests fetch_add for acquire and release memory ordering.
1114

1215
#include "atomic_memory_order.h"
1316
#include <iostream>
17+
#include <numeric>
1418
using namespace sycl;
1519

16-
int main() {
20+
template <memory_order order> void test_acquire_global() {
21+
const size_t N_items = 1024;
22+
const size_t N_iters = 1000;
23+
24+
int error = 0;
25+
int val[] = {0, 0};
26+
27+
queue q;
28+
{
29+
buffer<int> error_buf(&error, 1);
30+
buffer<int> val_buf(val, 1);
31+
32+
q.submit([&](handler &cgh) {
33+
auto error =
34+
error_buf.template get_access<access::mode::read_write>(cgh);
35+
auto val = val_buf.template get_access<access::mode::read_write>(cgh);
36+
cgh.parallel_for(range<1>(N_items), [=](item<1> it) {
37+
volatile int *val_p = val.get_pointer();
38+
auto atm0 =
39+
atomic_ref<int, memory_order::relaxed, memory_scope::device,
40+
access::address_space::global_space>(val[0]);
41+
auto atm1 =
42+
atomic_ref<int, memory_order::relaxed, memory_scope::device,
43+
access::address_space::global_space>(val[1]);
44+
for (int i = 0; i < N_iters; i++) {
45+
if (it.get_id(0) == 0) {
46+
atm0.fetch_add(1, order);
47+
val_p[1]++;
48+
} else {
49+
int tmp1 = atm1.load(memory_order::acquire);
50+
int tmp0 = atm0.load(memory_order::relaxed);
51+
if (tmp0 < tmp1) {
52+
error[0] = 1;
53+
}
54+
}
55+
}
56+
});
57+
}).wait_and_throw();
58+
}
59+
assert(error == 0);
60+
}
61+
62+
template <memory_order order> void test_acquire_local() {
63+
const size_t local_size = 1024;
64+
const size_t N_wgs = 16;
65+
const size_t global_size = local_size * N_wgs;
66+
const size_t N_iters = 1000;
67+
68+
int error = 0;
69+
int val[] = {0, 0};
70+
71+
queue q;
72+
{
73+
buffer<int> error_buf(&error, 1);
74+
buffer<int> val_buf(val, 1);
75+
76+
q.submit([&](handler &cgh) {
77+
auto error =
78+
error_buf.template get_access<access::mode::read_write>(cgh);
79+
accessor<int, 1, access::mode::read_write, access::target::local> val(
80+
2, cgh);
81+
cgh.parallel_for(
82+
nd_range<1>(global_size, local_size), [=](nd_item<1> it) {
83+
size_t lid = it.get_local_id(0);
84+
val[0] = 0;
85+
val[1] = 0;
86+
it.barrier(access::fence_space::local_space);
87+
volatile int *val_p = val.get_pointer();
88+
auto atm0 =
89+
atomic_ref<int, memory_order::relaxed, memory_scope::device,
90+
access::address_space::local_space>(val[0]);
91+
auto atm1 =
92+
atomic_ref<int, memory_order::relaxed, memory_scope::device,
93+
access::address_space::local_space>(val[1]);
94+
for (int i = 0; i < N_iters; i++) {
95+
if (it.get_local_id(0) == 0) {
96+
atm0.fetch_add(1, order);
97+
val_p[1]++;
98+
} else {
99+
int tmp1 = atm1.load(memory_order::acquire);
100+
int tmp0 = atm0.load(memory_order::relaxed);
101+
if (tmp0 < tmp1) {
102+
error[0] = 1;
103+
}
104+
}
105+
}
106+
});
107+
}).wait_and_throw();
108+
}
109+
assert(error == 0);
110+
}
111+
112+
template <memory_order order> void test_release_global() {
113+
const size_t N_items = 1024;
114+
const size_t N_iters = 1000;
115+
116+
int error = 0;
117+
int val[] = {0, 0};
118+
17119
queue q;
120+
{
121+
buffer<int> error_buf(&error, 1);
122+
buffer<int> val_buf(val, 1);
123+
124+
q.submit([&](handler &cgh) {
125+
auto error =
126+
error_buf.template get_access<access::mode::read_write>(cgh);
127+
auto val = val_buf.template get_access<access::mode::read_write>(cgh);
128+
cgh.parallel_for(range<1>(N_items), [=](item<1> it) {
129+
volatile int *val_p = val.get_pointer();
130+
auto atm0 =
131+
atomic_ref<int, memory_order::relaxed, memory_scope::device,
132+
access::address_space::global_space>(val[0]);
133+
auto atm1 =
134+
atomic_ref<int, memory_order::relaxed, memory_scope::device,
135+
access::address_space::global_space>(val[1]);
136+
for (int i = 0; i < N_iters; i++) {
137+
if (it.get_id(0) == 0) {
138+
val_p[0]++;
139+
atm1.fetch_add(1, order);
140+
} else {
141+
int tmp1 = atm1.load(memory_order::acquire);
142+
int tmp0 = atm0.load(memory_order::relaxed);
143+
if (tmp0 < tmp1) {
144+
error[0] = 1;
145+
}
146+
}
147+
}
148+
});
149+
}).wait_and_throw();
150+
}
151+
assert(error == 0);
152+
}
153+
154+
template <memory_order order> void test_release_local() {
155+
const size_t local_size = 1024;
156+
const size_t N_wgs = 16;
157+
const size_t global_size = local_size * N_wgs;
158+
const size_t N_iters = 1000;
18159

160+
int error = 0;
161+
int val[] = {0, 0};
162+
163+
queue q;
164+
{
165+
buffer<int> error_buf(&error, 1);
166+
buffer<int> val_buf(val, 1);
167+
168+
q.submit([&](handler &cgh) {
169+
auto error =
170+
error_buf.template get_access<access::mode::read_write>(cgh);
171+
accessor<int, 1, access::mode::read_write, access::target::local> val(
172+
2, cgh);
173+
cgh.parallel_for(
174+
nd_range<1>(global_size, local_size), [=](nd_item<1> it) {
175+
size_t lid = it.get_local_id(0);
176+
val[0] = 0;
177+
val[1] = 0;
178+
it.barrier(access::fence_space::local_space);
179+
volatile int *val_p = val.get_pointer();
180+
auto atm0 =
181+
atomic_ref<int, memory_order::relaxed, memory_scope::device,
182+
access::address_space::local_space>(val[0]);
183+
auto atm1 =
184+
atomic_ref<int, memory_order::relaxed, memory_scope::device,
185+
access::address_space::local_space>(val[1]);
186+
for (int i = 0; i < N_iters; i++) {
187+
if (it.get_local_id(0) == 0) {
188+
val_p[0]++;
189+
atm1.fetch_add(1, order);
190+
} else {
191+
int tmp1 = atm1.load(memory_order::acquire);
192+
int tmp0 = atm0.load(memory_order::relaxed);
193+
if (tmp0 < tmp1) {
194+
error[0] = 1;
195+
}
196+
}
197+
}
198+
});
199+
}).wait_and_throw();
200+
}
201+
assert(error == 0);
202+
}
203+
204+
int main() {
205+
queue q;
19206
std::vector<memory_order> supported_memory_orders =
20207
q.get_device().get_info<info::device::atomic_memory_order_capabilities>();
21208

22-
if (!is_supported(supported_memory_orders, memory_order::acq_rel)) {
23-
std::cout << "Skipping test\n";
24-
return 0;
209+
if (is_supported(supported_memory_orders, memory_order::acquire)) {
210+
std::cout << "Testing acquire" << std::endl;
211+
test_acquire_global<memory_order::acquire>();
212+
test_acquire_local<memory_order::acquire>();
213+
}
214+
if (is_supported(supported_memory_orders, memory_order::release)) {
215+
std::cout << "Testing release" << std::endl;
216+
test_release_global<memory_order::release>();
217+
test_release_local<memory_order::release>();
218+
}
219+
if (is_supported(supported_memory_orders, memory_order::acq_rel)) {
220+
std::cout << "Testing acq_rel" << std::endl;
221+
// Acquire-release memory order must also support both acquire and release
222+
// orderings.
223+
assert(is_supported(supported_memory_orders, memory_order::acquire) &&
224+
is_supported(supported_memory_orders, memory_order::release));
225+
test_acquire_global<memory_order::acq_rel>();
226+
test_acquire_local<memory_order::acq_rel>();
227+
test_release_global<memory_order::acq_rel>();
228+
test_release_local<memory_order::acq_rel>();
229+
}
230+
if (is_supported(supported_memory_orders, memory_order::seq_cst)) {
231+
std::cout << "Testing seq_cst" << std::endl;
232+
test_acquire_global<memory_order::seq_cst>();
233+
test_acquire_local<memory_order::seq_cst>();
234+
test_release_global<memory_order::seq_cst>();
235+
test_release_local<memory_order::seq_cst>();
25236
}
26-
27-
// Acquire-release memory order must also support both acquire and release
28-
// orderings.
29-
assert(is_supported(supported_memory_orders, memory_order::acquire) &&
30-
is_supported(supported_memory_orders, memory_order::release));
31237

32238
std::cout << "Test passed." << std::endl;
33239
}

0 commit comments

Comments
 (0)