Skip to content

Commit 05700f0

Browse files
authored
Add named_barriers tests (intel#1272)
add named_barriers tests
1 parent da119bf commit 05700f0

File tree

6 files changed

+984
-0
lines changed

6 files changed

+984
-0
lines changed
Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
//==------------ exec_in_order.cpp - DPC++ ESIMD on-device test -----------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
// REQUIRES: gpu-intel-pvc || esimd_emulator
10+
// RUN: %clangxx -fsycl %s -o %t.out
11+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
12+
//
13+
// Test checks support of named barrier in ESIMD kernel.
14+
// Threads are executed in ascending order of their local ID and each thread
15+
// stores data to addresses that partially overlap with addresses used by
16+
// previous thread.
17+
18+
#include <sycl/ext/intel/esimd.hpp>
19+
#include <sycl/sycl.hpp>
20+
21+
#include <iostream>
22+
23+
using namespace sycl;
24+
using namespace sycl::ext::intel::esimd;
25+
using namespace sycl::ext::intel::experimental::esimd;
26+
27+
template <int case_num> class KernelID;
28+
29+
template <int case_num, unsigned Threads, unsigned Size, bool UseSLM,
30+
class QueueTY>
31+
bool test(QueueTY q) {
32+
constexpr unsigned Groups = 1;
33+
// number of ints stored by each thread
34+
constexpr unsigned VL = Size / Threads;
35+
36+
static_assert(Threads % 2 == 0, "Number of threads must be even");
37+
static_assert(
38+
Size % (2 * Threads) == 0,
39+
"Surface size must be evenly divisible by twice the number of threads");
40+
41+
// need to write at least 2 ints per thread in order to overlap
42+
static_assert(VL >= 2,
43+
"Surface size must be at least 2 times the number of threads");
44+
45+
std::cout << "Case #" << case_num << "\n\tTreads: " << Threads
46+
<< "\n\tInts per thread: " << VL
47+
<< "\n\tMemory: " << (UseSLM ? "local\n" : "global\n");
48+
49+
std::vector<int> out(Size, 0);
50+
51+
try {
52+
buffer<int, 1> buf(out.data(), out.size());
53+
54+
// workgroups
55+
sycl::range<1> GlobalRange{Groups};
56+
// threads in each group
57+
sycl::range<1> LocalRange{Threads};
58+
sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange};
59+
60+
auto e = q.submit([&](handler &cgh) {
61+
auto acc = buf.template get_access<access::mode::write>(cgh);
62+
cgh.parallel_for<KernelID<case_num>>(
63+
Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL {
64+
// Threads - 1 named barriers required
65+
// but id 0 reserved for unnamed
66+
named_barrier_init<Threads>();
67+
68+
unsigned int idx = ndi.get_local_id(0);
69+
// overlaping offset
70+
unsigned int off = idx * VL * sizeof(int) / 2;
71+
72+
int flag = 0; // producer-consumer mode
73+
int producers = 2;
74+
int consumers = 2;
75+
simd<int, VL> val(idx);
76+
77+
if constexpr (UseSLM) {
78+
slm_init(Size * sizeof(int));
79+
lsc_slm_block_store<int, VL>(2 * off, simd<int, VL>(0));
80+
}
81+
barrier();
82+
83+
// Threads are executed in ascending order of their local ID and
84+
// each thread stores data to addresses that partially overlap with
85+
// addresses used by previous thread.
86+
87+
// idx == 0 skips this branch and goes straight to lsc_surf_store
88+
// idx == 1 signals barrier 1
89+
// idx == 2 signals barrier 2
90+
// idx == 3 signals barrier 3
91+
// and so on
92+
if (idx > 0) {
93+
int barrier_id = idx;
94+
named_barrier_signal(barrier_id, flag, producers, consumers);
95+
named_barrier_wait(barrier_id);
96+
}
97+
98+
if constexpr (UseSLM)
99+
lsc_slm_block_store<int, VL>(off, val);
100+
else
101+
lsc_block_store<int, VL>(acc, off, val);
102+
103+
lsc_fence();
104+
105+
// idx == 0 arrives here first and signals barrier 1
106+
// idx == 1 arrives here next and signals barrier 2
107+
// idx == 2 arrives here next and signals barrier 3
108+
// and so on, but last thread skipped this block
109+
if (idx < Threads - 1) {
110+
int barrier_id = idx + 1;
111+
named_barrier_signal(barrier_id, flag, producers, consumers);
112+
named_barrier_wait(barrier_id);
113+
}
114+
115+
barrier();
116+
if constexpr (UseSLM) {
117+
auto res = lsc_slm_block_load<int, VL>(2 * off);
118+
lsc_block_store<int, VL>(acc, 2 * off, res);
119+
}
120+
});
121+
});
122+
e.wait();
123+
} catch (sycl::exception const &e) {
124+
std::cout << "SYCL exception caught: " << e.what() << '\n';
125+
return false;
126+
}
127+
128+
bool passed = true;
129+
for (int i = 0; i < Size; i++) {
130+
int etalon = i * 2 * Threads / Size;
131+
if (etalon == Threads) // last stored chunk
132+
etalon -= 1;
133+
if (etalon > Threads) // excessive part of surface
134+
etalon = 0;
135+
if (out[i] != etalon) {
136+
passed = false;
137+
std::cout << "out[" << i << "]=" << out[i] << " vs " << etalon << "\n";
138+
}
139+
}
140+
141+
std::cout << (passed ? " Passed\n" : " FAILED\n");
142+
return passed;
143+
}
144+
145+
int main() {
146+
auto GPUSelector = gpu_selector{};
147+
auto q = queue{GPUSelector};
148+
auto dev = q.get_device();
149+
std::cout << "Running on " << dev.get_info<sycl::info::device::name>()
150+
<< "\n";
151+
152+
bool passed = true;
153+
154+
passed &= test<1, 2, 4, true>(q);
155+
passed &= test<2, 2, 4, false>(q);
156+
157+
passed &= test<3, 4, 8, true>(q);
158+
passed &= test<4, 4, 8, false>(q);
159+
160+
passed &= test<5, 4, 8, true>(q);
161+
passed &= test<6, 4, 8, false>(q);
162+
163+
passed &= test<7, 8, 16, true>(q);
164+
passed &= test<8, 8, 16, false>(q);
165+
166+
passed &= test<9, 2, 8, true>(q);
167+
passed &= test<10, 2, 8, false>(q);
168+
169+
passed &= test<11, 4, 16, true>(q);
170+
passed &= test<12, 4, 16, false>(q);
171+
172+
passed &= test<13, 4, 32, true>(q);
173+
passed &= test<14, 4, 32, false>(q);
174+
175+
passed &= test<15, 8, 32, true>(q);
176+
passed &= test<16, 8, 32, false>(q);
177+
178+
passed &= test<17, 16, 64, true>(q);
179+
passed &= test<18, 16, 64, false>(q);
180+
181+
return passed ? 0 : 1;
182+
}
Lines changed: 212 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,212 @@
1+
//==------- exec_in_order_branched.cpp - DPC++ ESIMD on-device test -------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
// REQUIRES: gpu-intel-pvc || esimd_emulator
10+
// RUN: %clangxx -fsycl %s -o %t.out
11+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
12+
//
13+
// Test checks support of named barrier in ESIMD kernel.
14+
// Threads are executed in ascending order of their local ID and each thread
15+
// stores data to addresses that partially overlap with addresses used by
16+
// previous thread. Same as "exec_in_order.cpp", but each thread in separate
17+
// 'if' branch.
18+
19+
#include <sycl/ext/intel/esimd.hpp>
20+
#include <sycl/sycl.hpp>
21+
22+
#include <iostream>
23+
24+
using namespace sycl;
25+
using namespace sycl::ext::intel::esimd;
26+
using namespace sycl::ext::intel::experimental::esimd;
27+
28+
template <int case_num> class KernelID;
29+
30+
template <int case_num, unsigned Size, bool UseSLM, class QueueTY>
31+
bool test(QueueTY q) {
32+
constexpr unsigned Groups = 1;
33+
constexpr unsigned Threads = 4;
34+
// number of ints stored by each thread
35+
constexpr unsigned VL = Size / Threads;
36+
37+
static_assert(
38+
Size % (2 * Threads) == 0,
39+
"Surface size must be evenly divisible by twice the number of threads");
40+
41+
// need to write at least 2 ints per thread in order to overlap
42+
static_assert(VL >= 2,
43+
"Surface size must be at least 2 times the number of threads");
44+
45+
std::cout << "Case #" << case_num << "\n\tTreads: " << Threads
46+
<< "\n\tInts per thread: " << VL
47+
<< "\n\tMemory: " << (UseSLM ? "local\n" : "global\n");
48+
49+
std::vector<int> out(Size, 0);
50+
51+
try {
52+
buffer<int, 1> buf(out.data(), out.size());
53+
54+
// workgroups
55+
sycl::range<1> GlobalRange{Groups};
56+
// threads in each group
57+
sycl::range<1> LocalRange{Threads};
58+
sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange};
59+
60+
auto e = q.submit([&](handler &cgh) {
61+
auto acc = buf.get_access<access::mode::write>(cgh);
62+
cgh.parallel_for<KernelID<case_num>>(
63+
Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL {
64+
// Threads - 1 named barriers required
65+
// but id 0 reserved for unnamed
66+
named_barrier_init<Threads>();
67+
68+
unsigned int idx = ndi.get_local_id(0);
69+
// overlaping offset
70+
unsigned int off = idx * VL * sizeof(int) / 2;
71+
72+
int flag = 0; // producer-consumer mode
73+
int producers = 2;
74+
int consumers = 2;
75+
76+
simd<int, VL> val(idx);
77+
78+
if constexpr (UseSLM) {
79+
slm_init(Size * sizeof(int));
80+
lsc_slm_block_store<int, VL>(2 * off, simd<int, VL>(0));
81+
}
82+
83+
barrier();
84+
85+
// Threads are executed in ascending order of their local ID and
86+
// each thread stores data to addresses that partially overlap with
87+
// addresses used by previous thread.
88+
if (idx == 0) {
89+
if constexpr (UseSLM) {
90+
lsc_slm_block_store<int, VL>(off, val);
91+
} else {
92+
lsc_fence();
93+
lsc_block_store<int, VL>(acc, off, val);
94+
lsc_fence();
95+
}
96+
97+
// T0 signals barrier 1 and locks
98+
// waiting for first signal from T1
99+
const int barrier_id = 1;
100+
named_barrier_signal(barrier_id, flag, producers, consumers);
101+
named_barrier_wait(barrier_id);
102+
} else if (idx == 1) {
103+
// T1 signals barrier 1 and locks, waiting for signal from T0
104+
const int barrier_id = 1;
105+
named_barrier_signal(barrier_id, flag, producers, consumers);
106+
named_barrier_wait(barrier_id);
107+
108+
if constexpr (UseSLM) {
109+
lsc_slm_block_store<int, VL>(off, val);
110+
} else {
111+
lsc_fence();
112+
lsc_block_store<int, VL>(acc, off, val);
113+
lsc_fence();
114+
}
115+
116+
// T1 signals barrier 2 and locks
117+
// waiting for first signal from T2
118+
const int barrier_id2 = 2;
119+
named_barrier_signal(barrier_id2, flag, producers, consumers);
120+
named_barrier_wait(barrier_id2);
121+
} else if (idx == 2) {
122+
// T2 signals barrier 2 and locks
123+
// waiting for second signal from T1
124+
const int barrier_id = 2;
125+
named_barrier_signal(barrier_id, flag, producers, consumers);
126+
named_barrier_wait(barrier_id);
127+
128+
if constexpr (UseSLM) {
129+
lsc_slm_block_store<int, VL>(off, val);
130+
} else {
131+
lsc_fence();
132+
lsc_block_store<int, VL>(acc, off, val);
133+
lsc_fence();
134+
}
135+
136+
// T2 signals barrier 3 and locks, waiting for signal from T3
137+
const int barrier_id2 = 3;
138+
named_barrier_signal(barrier_id2, flag, producers, consumers);
139+
named_barrier_wait(barrier_id2);
140+
} else {
141+
// T3 signals barrier 3 and locks
142+
// waiting for second signal from T2
143+
const int barrier_id = 3;
144+
named_barrier_signal(barrier_id, flag, producers, consumers);
145+
named_barrier_wait(barrier_id);
146+
147+
if constexpr (UseSLM) {
148+
lsc_slm_block_store<int, VL>(off, val);
149+
} else {
150+
lsc_fence();
151+
lsc_block_store<int, VL>(acc, off, val);
152+
lsc_fence();
153+
}
154+
}
155+
156+
barrier();
157+
if constexpr (UseSLM) {
158+
auto res = lsc_slm_block_load<int, VL>(2 * off);
159+
lsc_block_store<int, VL>(acc, 2 * off, res);
160+
}
161+
});
162+
});
163+
e.wait();
164+
} catch (sycl::exception const &e) {
165+
std::cout << "SYCL exception caught: " << e.what() << '\n';
166+
return false;
167+
}
168+
169+
bool passed = true;
170+
for (int i = 0; i < Size; i++) {
171+
int etalon = i * 2 * Threads / Size;
172+
if (etalon == Threads) // last stored chunk
173+
etalon -= 1;
174+
if (etalon > Threads) // excessive part of surface
175+
etalon = 0;
176+
if (out[i] != etalon) {
177+
passed = false;
178+
std::cout << "out[" << i << "]=" << out[i] << " vs " << etalon << "\n";
179+
}
180+
}
181+
182+
std::cout << (passed ? " Passed\n" : " FAILED\n");
183+
return passed;
184+
}
185+
186+
int main() {
187+
auto GPUSelector = gpu_selector{};
188+
auto q = queue{GPUSelector};
189+
auto dev = q.get_device();
190+
std::cout << "Running on " << dev.get_info<sycl::info::device::name>()
191+
<< "\n";
192+
193+
bool passed = true;
194+
195+
// Threads == 4
196+
passed &= test<1, 8, true>(q);
197+
passed &= test<2, 8, false>(q);
198+
199+
passed &= test<3, 16, true>(q);
200+
passed &= test<4, 16, false>(q);
201+
202+
passed &= test<5, 32, true>(q);
203+
passed &= test<6, 32, false>(q);
204+
205+
passed &= test<7, 64, true>(q);
206+
passed &= test<8, 64, false>(q);
207+
208+
passed &= test<9, 128, true>(q);
209+
passed &= test<10, 128, false>(q);
210+
211+
return passed ? 0 : 1;
212+
}

0 commit comments

Comments
 (0)