Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit ab3efc1

Browse files
authored
[ESIMD][InvokeSimd] Add named barrier and SLM access tests (#1621)
1 parent c8d032f commit ab3efc1

File tree

5 files changed

+457
-0
lines changed

5 files changed

+457
-0
lines changed
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
// TODO: enable on Windows once driver is ready
2+
// NOTE: named barrier supported only since PVC
3+
// REQUIRES: gpu-intel-pvc && linux
4+
// UNSUPPORTED: cuda || hip
5+
//
6+
// TODO: enable when Jira issue resolved, currently fail with VISALTO enable
7+
// XFAIL: gpu-intel-pvc
8+
//
9+
// RUN: %clangxx -DIMPL_SUBGROUP -fsycl -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr %S/../nbarrier_basic.cpp -o %t.out
10+
// RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %GPU_RUN_PLACEHOLDER %t.out
11+
//
12+
// VISALTO enable run
13+
// RUN: env IGC_VISALTO=63 IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %GPU_RUN_PLACEHOLDER %t.out
14+
15+
/*
16+
* This tests is the same as InvokeSimd/Regression/slm_load_store.cpp, but
17+
* compiles without optional subgroup attribute specified and intended to check
18+
* that compiler is able to choose subgroup size correctly.
19+
*/
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
// TODO: enable on Windows once driver is ready
2+
// REQUIRES: gpu && linux
3+
// UNSUPPORTED: cuda || hip
4+
//
5+
// TODO: enable when Jira issue resolved
6+
// REQUIRES: TEMPORARY_DISABLED
7+
//
8+
// RUN: %clangxx -DIMPL_SUBGROUP -fsycl -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr %S/../slm_load_store.cpp -o %t.out
9+
// RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %GPU_RUN_PLACEHOLDER %t.out
10+
//
11+
// VISALTO enable run
12+
// RUN: env IGC_VISALTO=63 IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %GPU_RUN_PLACEHOLDER %t.out
13+
14+
/*
15+
* This tests is the same as InvokeSimd/Regression/slm_load_store.cpp, but
16+
* compiles without optional subgroup attribute specified and intended to check
17+
* that compiler is able to choose subgroup size correctly.
18+
*/
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
// TODO: enable on Windows once driver is ready
2+
// NOTE: named barrier supported only since PVC
3+
// REQUIRES: gpu-intel-pvc && linux
4+
// UNSUPPORTED: cuda || hip
5+
//
6+
// TODO: enable when Jira issue resolved, currently fail with VISALTO enable
7+
// XFAIL: gpu-intel-pvc
8+
//
9+
// RUN: %clangxx -fsycl -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr %s -o %t.out
10+
// RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %GPU_RUN_PLACEHOLDER %t.out
11+
//
12+
// VISALTO enable run
13+
// RUN: env IGC_VISALTO=63 IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %GPU_RUN_PLACEHOLDER %t.out
14+
15+
/*
16+
* Test checks basic support for named barriers in invoke_simd context.
17+
*/
18+
19+
#include <sycl/ext/intel/esimd.hpp>
20+
#include <sycl/ext/intel/experimental/esimd/memory.hpp>
21+
#include <sycl/ext/oneapi/experimental/invoke_simd.hpp>
22+
#include <sycl/sycl.hpp>
23+
24+
#include <functional>
25+
#include <iostream>
26+
#include <type_traits>
27+
28+
/* Subgroup size attribute is optional
29+
* In case it is absent compiler decides what subgroup size to use
30+
*/
31+
#ifdef IMPL_SUBGROUP
32+
#define SUBGROUP_ATTR
33+
#else
34+
#define SUBGROUP_ATTR [[intel::reqd_sub_group_size(VL)]]
35+
#endif
36+
37+
using namespace sycl;
38+
using namespace sycl::ext::oneapi::experimental;
39+
namespace esimd = sycl::ext::intel::esimd;
40+
namespace experimental_esimd = sycl::ext::intel::experimental::esimd;
41+
42+
constexpr int Size = 4096;
43+
constexpr int VL = 16;
44+
45+
class Test {};
46+
47+
ESIMD_INLINE void ESIMD_CALLEE_nbarrier(nd_item<1> *ndi) SYCL_ESIMD_FUNCTION {
48+
const uint8_t BARNUM = 32;
49+
experimental_esimd::named_barrier_init<BARNUM>();
50+
51+
uint8_t barrier_id = 1;
52+
uint8_t producer_consumer_mode = 0;
53+
uint32_t num_producers = 16;
54+
uint32_t num_consumers = 16;
55+
__ESIMD_ENS::named_barrier_signal(barrier_id, producer_consumer_mode,
56+
num_producers, num_consumers);
57+
__ESIMD_ENS::named_barrier_wait(barrier_id);
58+
}
59+
60+
[[intel::device_indirectly_callable]] SYCL_EXTERNAL void __regcall SIMD_CALLEE_nbarrier(
61+
nd_item<1> *ndi) SYCL_ESIMD_FUNCTION;
62+
63+
int main(void) {
64+
auto Queue = queue{gpu_selector_v};
65+
auto Device = Queue.get_device();
66+
67+
std::cout << "Running on " << Device.get_info<sycl::info::device::name>()
68+
<< "\n";
69+
try {
70+
// We need that many workgroups
71+
sycl::range<1> GlobalRange{16 * 16};
72+
// We need that many threads in each group
73+
sycl::range<1> LocalRange{16 * 16};
74+
75+
auto e = Queue.submit([&](handler &cgh) {
76+
cgh.parallel_for<Test>(nd_range<1>(GlobalRange, LocalRange),
77+
[=](nd_item<1> item) SUBGROUP_ATTR {
78+
sycl::group<1> g = item.get_group();
79+
sycl::sub_group sg = item.get_sub_group();
80+
invoke_simd(sg, SIMD_CALLEE_nbarrier,
81+
uniform{&item});
82+
});
83+
});
84+
e.wait();
85+
} catch (sycl::exception const &e) {
86+
std::cout << "SYCL exception caught: " << e.what() << '\n';
87+
return e.code().value();
88+
}
89+
90+
return 0;
91+
}
92+
93+
[[intel::device_indirectly_callable]] SYCL_EXTERNAL void __regcall SIMD_CALLEE_nbarrier(
94+
nd_item<1> *ndi) SYCL_ESIMD_FUNCTION {
95+
ESIMD_CALLEE_nbarrier(ndi);
96+
return;
97+
}
Lines changed: 202 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,202 @@
1+
// TODO: enable on Windows once driver is ready
2+
// NOTE: named barrier supported only since PVC
3+
// REQUIRES: gpu-intel-pvc && linux
4+
// UNSUPPORTED: cuda || hip
5+
//
6+
// TODO: enable when Jira issue resolved, currently fail with VISALTO enable
7+
// XFAIL: gpu-intel-pvc
8+
//
9+
// RUN: %clangxx -fsycl -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr %s -o %t.out
10+
// RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %GPU_RUN_PLACEHOLDER %t.out
11+
//
12+
// VISALTO enable run
13+
// RUN: env IGC_VISALTO=63 IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %GPU_RUN_PLACEHOLDER %t.out
14+
15+
/*
16+
* Test checks basic support for named barriers in invoke_simd context.
17+
* Threads are executed in ascending order of their local ID and each thread
18+
* stores data to addresses that partially overlap with addresses used by
19+
* previous thread.
20+
*/
21+
22+
#include <sycl/ext/intel/esimd.hpp>
23+
#include <sycl/ext/oneapi/experimental/invoke_simd.hpp>
24+
#include <sycl/sycl.hpp>
25+
26+
#include <sycl/ext/intel/esimd.hpp>
27+
#include <sycl/ext/intel/experimental/esimd/memory.hpp>
28+
#include <sycl/ext/oneapi/experimental/invoke_simd.hpp>
29+
#include <sycl/sycl.hpp>
30+
31+
#include <functional>
32+
#include <iostream>
33+
#include <type_traits>
34+
35+
using namespace sycl;
36+
using namespace sycl::ext::oneapi::experimental;
37+
namespace esimd = sycl::ext::intel::esimd;
38+
namespace experimental_esimd = sycl::ext::intel::experimental::esimd;
39+
40+
constexpr int VL = 16;
41+
template <int case_num> class KernelID;
42+
43+
template <unsigned Threads, unsigned Size, bool UseSLM>
44+
ESIMD_INLINE void ESIMD_CALLEE_nbarrier(int localID,
45+
int *o) SYCL_ESIMD_FUNCTION {
46+
// Threads - 1 named barriers required
47+
// but id 0 reserved for unnamed
48+
experimental_esimd::named_barrier_init<Threads>();
49+
50+
int flag = 0; // producer-consumer mode
51+
int producers = 2;
52+
int consumers = 2;
53+
54+
// overlaping offsets
55+
unsigned int off = VL * localID / 2;
56+
unsigned int off_slm = off * sizeof(int);
57+
esimd::simd<int, VL> val(localID);
58+
59+
if constexpr (UseSLM) {
60+
/* TODO: SLM has to be allocated of outside invoke_simd, but propper
61+
* intarface is not yet ready. Current test implementation in this regard
62+
* is a subject to future changes.
63+
*/
64+
esimd::slm_init(Size * sizeof(int));
65+
esimd::simd<int, VL> zero(0);
66+
experimental_esimd::lsc_slm_block_store<int, VL>(2 * off_slm, zero);
67+
}
68+
esimd::barrier();
69+
70+
// Threads are executed in ascending order of their local ID and
71+
// each thread stores data to addresses that partially overlap with
72+
// addresses used by previous thread.
73+
74+
// localID == 0 skips this branch and goes straight to lsc_surf_store
75+
// localID == 1 signals barrier 1
76+
// localID == 2 signals barrier 2
77+
// localID == 3 signals barrier 3
78+
// and so on
79+
if (localID > 0) {
80+
int barrier_id = localID;
81+
__ESIMD_ENS::named_barrier_signal(barrier_id, flag, producers, consumers);
82+
__ESIMD_ENS::named_barrier_wait(barrier_id);
83+
}
84+
85+
if constexpr (UseSLM)
86+
experimental_esimd::lsc_slm_block_store<int, VL>(off_slm, val);
87+
else
88+
experimental_esimd::lsc_block_store<int, VL>(o + off, val);
89+
90+
experimental_esimd::lsc_fence();
91+
92+
// localID == 0 arrives here first and signals barrier 1
93+
// localID == 1 arrives here next and signals barrier 2
94+
// localID == 2 arrives here next and signals barrier 3
95+
// and so on, but last thread skipped this block
96+
if (localID < Threads - 1) {
97+
int barrier_id = localID + 1;
98+
__ESIMD_ENS::named_barrier_signal(barrier_id, flag, producers, consumers);
99+
__ESIMD_ENS::named_barrier_wait(barrier_id);
100+
}
101+
102+
esimd::barrier();
103+
if constexpr (UseSLM) {
104+
auto res = experimental_esimd::lsc_slm_block_load<int, VL>(2 * off_slm);
105+
experimental_esimd::lsc_block_store<int, VL>(o + 2 * off, res);
106+
}
107+
}
108+
109+
template <unsigned Threads, unsigned Size, bool UseSLM>
110+
[[intel::device_indirectly_callable]] SYCL_EXTERNAL void __regcall SIMD_CALLEE_nbarrier(
111+
int localID, int *o) SYCL_ESIMD_FUNCTION {
112+
ESIMD_CALLEE_nbarrier<Threads, Size, UseSLM>(localID, o);
113+
}
114+
115+
template <int case_num, unsigned Threads, bool UseSLM, class QueueTY>
116+
bool test(QueueTY q) {
117+
// number of ints stored by each thread
118+
constexpr unsigned Size = VL * Threads;
119+
120+
static_assert(Threads % 2 == 0, "Number of threads must be even");
121+
std::cout << "Case #" << case_num << "\n\tTreads: " << Threads
122+
<< "\n\tInts per thread: " << VL
123+
<< "\n\tMemory: " << (UseSLM ? "local\n" : "global\n");
124+
125+
auto *out = malloc_shared<int>(Size, q);
126+
for (int i = 0; i < Size; i++) {
127+
out[i] = -1;
128+
}
129+
130+
try {
131+
// workgroups
132+
sycl::range<1> GlobalRange{Size};
133+
// threads in each group
134+
sycl::range<1> LocalRange{Threads};
135+
sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange};
136+
137+
auto e = q.submit([&](handler &cgh) {
138+
cgh.parallel_for<KernelID<case_num>>(
139+
nd_range<1>(GlobalRange, LocalRange),
140+
// This test requires an explicit specification of the subgroup size
141+
[=](nd_item<1> item) [[intel::reqd_sub_group_size(VL)]] {
142+
sycl::group<1> g = item.get_group();
143+
sycl::sub_group sg = item.get_sub_group();
144+
uint32_t i =
145+
sg.get_group_linear_id() * VL + g.get_linear_id() * Threads;
146+
uint32_t wi_id = i + sg.get_local_id();
147+
// Thread local ID in ESIMD context
148+
int localID = wi_id / VL;
149+
invoke_simd(sg, SIMD_CALLEE_nbarrier<Threads, Size, UseSLM>,
150+
uniform{localID}, uniform{out});
151+
});
152+
});
153+
e.wait();
154+
} catch (sycl::exception const &e) {
155+
std::cout << "SYCL exception caught: " << e.what() << '\n';
156+
free(out, q);
157+
return false;
158+
}
159+
160+
bool passed = true;
161+
for (int i = 0; i < Size; i++) {
162+
int etalon = i * 2 * Threads / Size;
163+
if (etalon == Threads) // last stored chunk
164+
etalon -= 1;
165+
if (etalon > Threads) // excessive part of surface
166+
etalon = 0;
167+
if (out[i] != etalon) {
168+
passed = false;
169+
std::cout << "out[" << i << "]=" << out[i] << " vs " << etalon << "\n";
170+
}
171+
}
172+
173+
free(out, q);
174+
175+
std::cout << (passed ? " Passed\n" : " FAILED\n");
176+
return passed;
177+
}
178+
179+
int main() {
180+
auto q = queue{gpu_selector_v};
181+
auto dev = q.get_device();
182+
std::cout << "Running on " << dev.get_info<sycl::info::device::name>()
183+
<< "\n";
184+
185+
bool passed = true;
186+
187+
passed &= test<1, 2, false>(q);
188+
passed &= test<2, 4, false>(q);
189+
passed &= test<3, 8, false>(q);
190+
passed &= test<4, 16, false>(q);
191+
192+
/* TODO: Enable the sub-tests with UseSLM=true after the issue with SLM
193+
* initialization is fixed.
194+
*
195+
passed &= test<5, 2, true>(q);
196+
passed &= test<6, 4, true>(q);
197+
passed &= test<7, 8, true>(q);
198+
passed &= test<8, 16, true>(q);
199+
*/
200+
201+
return passed ? 0 : 1;
202+
}

0 commit comments

Comments
 (0)