Skip to content

Commit adbf7ab

Browse files
authored
[ESIMD] E2E test for slm_allocator API. (intel#1449)
* [ESIMD] E2E test for slm_allocator API. Signed-off-by: Konstantin S Bobrovsky <[email protected]>
1 parent a73cbff commit adbf7ab

File tree

3 files changed

+484
-0
lines changed

3 files changed

+484
-0
lines changed

SYCL/ESIMD/slm_alloc.cpp

Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
// REQUIRES: gpu
2+
// UNSUPPORTED: cuda || hip
3+
//
4+
// RUN: %clangxx -fsycl %s -o %t.1.out
5+
// RUN: %GPU_RUN_PLACEHOLDER %t.1.out
6+
//
7+
// Vary the test case by forcing inlining of the functions with slm_allocator:
8+
// RUN: %clangxx -fsycl -DFORCE_INLINE %s -o %t.2.out
9+
// RUN: %GPU_RUN_PLACEHOLDER %t.2.out
10+
11+
// This is end-to-end test for the slm_allocator API used together with the
12+
// slm_init. The call graph is:
13+
// Test1(kernel) - uses slm_init(SLM_IN_KERNEL)
14+
// / \
15+
// / v
16+
// / bar - uses slm_allocator(SLM_IN_BAR)
17+
// v
18+
// foo - uses slm_allocator(SLM_IN_FOO)
19+
// Test1 kernel SLM usage is SLM_IN_KERNEL + max(SLM_IN_BAR, SLM_IN_FOO).
20+
// SLM offset returned by the slm_allocator in foo and bar is the same and is
21+
// SLM_IN_KERNEL bytes.
22+
// Bar uses slightly bigger SLM frame than foo. It modifies values (adds 10) in
23+
// SLM resulting from foo, plus appends couple more '100's.
24+
25+
#include <iostream>
26+
#include <sycl/ext/intel/esimd.hpp>
27+
#include <sycl/sycl.hpp>
28+
29+
using namespace sycl;
30+
using namespace sycl::ext::intel::esimd;
31+
using namespace sycl::ext::intel::experimental::esimd;
32+
33+
using T = int;
34+
constexpr int LOCAL_SIZE = 4;
35+
constexpr int GLOBAL_SIZE = 8;
36+
37+
constexpr int NUM_WGS = GLOBAL_SIZE / LOCAL_SIZE;
38+
39+
constexpr int ELEM_SIZE = sizeof(T);
40+
41+
constexpr int SLM_IN_KERNEL = LOCAL_SIZE * ELEM_SIZE;
42+
constexpr int SLM_IN_FOO = LOCAL_SIZE * ELEM_SIZE;
43+
constexpr int BAR_EXTRA_ELEMS = 2;
44+
constexpr int SLM_IN_BAR = SLM_IN_FOO + BAR_EXTRA_ELEMS * ELEM_SIZE;
45+
constexpr int SLM_TOTAL = SLM_IN_KERNEL + std::max(SLM_IN_FOO, SLM_IN_BAR);
46+
constexpr int BAR_MARKER1 = 10;
47+
constexpr int BAR_MARKER2 = 100;
48+
49+
#ifdef FORCE_INLINE
50+
constexpr bool force_inline = true;
51+
inline
52+
__attribute__((always_inline))
53+
#else
54+
constexpr bool force_inline = false;
55+
__attribute__((noinline))
56+
#endif // FORCE_INLINE
57+
void
58+
foo(int local_id) {
59+
slm_allocator<SLM_IN_FOO> a;
60+
uint32_t slm_off = a.get_offset();
61+
// write data chunk "Y":
62+
slm_scalar_store(slm_off + local_id * ELEM_SIZE, (T)local_id);
63+
}
64+
65+
#ifdef FORCE_INLINE
66+
inline
67+
__attribute__((always_inline))
68+
#else
69+
__attribute__((noinline))
70+
#endif // FORCE_INLINE
71+
void
72+
bar(int local_id) {
73+
slm_allocator<SLM_IN_BAR> a;
74+
uint32_t slm_off = a.get_offset();
75+
uint32_t off = slm_off + local_id * ELEM_SIZE;
76+
T v = slm_scalar_load<T>(off);
77+
// update data chunk "Y":
78+
slm_scalar_store(off, v + BAR_MARKER1);
79+
80+
if (local_id == 0) {
81+
for (int i = 0; i < BAR_EXTRA_ELEMS; i++) {
82+
// write data chunk "Z":
83+
slm_scalar_store((2 * LOCAL_SIZE + i) * ELEM_SIZE, (T)BAR_MARKER2);
84+
}
85+
}
86+
}
87+
88+
int main(void) {
89+
queue q;
90+
auto dev = q.get_device();
91+
std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
92+
std::cout << "force_inline=" << force_inline << "\n";
93+
auto ctxt = q.get_context();
94+
uint32_t size = SLM_TOTAL * NUM_WGS / ELEM_SIZE;
95+
96+
T *arr = malloc_shared<T>(size, dev, ctxt);
97+
98+
auto e = q.submit([&](handler &cgh) {
99+
cgh.parallel_for<class Test1>(
100+
nd_range<1>(GLOBAL_SIZE, LOCAL_SIZE),
101+
[=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
102+
slm_init(SLM_IN_KERNEL);
103+
int local_id = ndi.get_local_linear_id();
104+
int group_id = ndi.get_group_linear_id();
105+
106+
// write data chunk "X":
107+
slm_scalar_store(local_id * ELEM_SIZE, local_id);
108+
barrier();
109+
110+
foo(local_id);
111+
barrier();
112+
113+
bar(local_id);
114+
barrier();
115+
116+
// copy data from SLM to the output for further verification
117+
if (local_id == 0) {
118+
uint32_t group_off = SLM_TOTAL * group_id;
119+
120+
for (int i = 0; i < SLM_TOTAL / ELEM_SIZE; i++) {
121+
uint32_t slm_off = i * ELEM_SIZE;
122+
uint32_t mem_off = group_off + slm_off;
123+
scatter(arr, simd<uint32_t, 1>(mem_off),
124+
simd<T, 1>(slm_scalar_load<T>(slm_off)));
125+
}
126+
}
127+
});
128+
});
129+
e.wait();
130+
131+
for (int i = 0; i < NUM_WGS * SLM_TOTAL / ELEM_SIZE; i++) {
132+
std::cout << " " << arr[i];
133+
if ((i + 1) % 10 == 0) {
134+
std::cout << "\n";
135+
}
136+
}
137+
std::cout << "\n";
138+
int err_cnt = 0;
139+
140+
for (int g = 0; g < NUM_WGS; g++) {
141+
uint32_t group_off = SLM_TOTAL * g / ELEM_SIZE;
142+
for (int i = 0; i < LOCAL_SIZE; i++) {
143+
int ind = group_off + i;
144+
145+
// check data copied from kernel's SLM frame ("X")
146+
auto test = arr[ind];
147+
auto gold = i;
148+
149+
if (test != gold) {
150+
if (++err_cnt < 10) {
151+
std::cerr << "*** ERROR (X) at " << ind << ": " << test
152+
<< " != " << gold << " (gold)\n";
153+
}
154+
}
155+
// check data copied from the overlapping part of foo's and bar's SLM
156+
// frames - "Y"
157+
ind = ind + LOCAL_SIZE; // shift to the foo/bar SLM frame
158+
test = arr[ind];
159+
gold = i + BAR_MARKER1;
160+
161+
if (test != gold) {
162+
if (++err_cnt < 10) {
163+
std::cerr << "*** ERROR (Y) at " << ind << ": " << test
164+
<< " != " << gold << " (gold)\n";
165+
}
166+
}
167+
}
168+
// now check data written by bar past the overlapping part of foo/bar SLM
169+
// frame - "Z"
170+
for (int i = 0; i < BAR_EXTRA_ELEMS; i++) {
171+
int ind =
172+
group_off + 2 /*kernel's and foo's SLM frames*/ * LOCAL_SIZE + i;
173+
auto test = arr[ind];
174+
auto gold = BAR_MARKER2;
175+
176+
if (test != gold) {
177+
if (++err_cnt < 10) {
178+
std::cerr << "*** ERROR (Z) at " << ind << ": " << test
179+
<< " != " << gold << " (gold)\n";
180+
}
181+
}
182+
}
183+
}
184+
std::cout << (err_cnt ? "FAILED\n" : "Passed\n");
185+
return err_cnt ? 1 : 0;
186+
}
Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
// REQUIRES: gpu
2+
// UNSUPPORTED: cuda || hip
3+
//
4+
// RUN: %clangxx -fsycl %s -o %t.1.out
5+
// RUN: %GPU_RUN_PLACEHOLDER %t.1.out
6+
//
7+
// Vary the test case by forcing inlining of the functions with slm_allocator:
8+
// RUN: %clangxx -fsycl -DFORCE_INLINE %s -o %t.2.out
9+
// RUN: %GPU_RUN_PLACEHOLDER %t.2.out
10+
11+
// Checks validity of SLM frame offsets in case of complex call graph with two
12+
// kernels and 2 functions all using SLM, and one of the functions using two
13+
// slm_allocator objects with nested liveness ranges.
14+
15+
// Hierarchy of SLM frames:
16+
// N1 N2
17+
// | \ |
18+
// | \ |
19+
// v \ |
20+
// X \ |
21+
// | \ \ |
22+
// | \ \|
23+
// v \---v
24+
// Y Z
25+
//
26+
// SLM offsets are expected to be:
27+
// 1) no inlining case:
28+
// --- Kernel0
29+
// X - N1
30+
// Y - N1 + X
31+
// Z - max(N1 + X, N2)
32+
// --- Kernel2
33+
// X - 0 (not reachable, offset not updated in the result)
34+
// Y - 0 (not reachable, offset not updated in the result)
35+
// Z - max(N1 + X, N2)
36+
// 2) forced inlining case:
37+
// --- Kernel0
38+
// X - N1
39+
// Y - N1 + X
40+
// Z - N1 // this is because Z (bar) is inlined into X (foo) and into
41+
// // N1 (kernel1), and execution of the second inlined scope
42+
// // allocation and offset recording into the result happens last.
43+
// --- Kernel2
44+
// X - 0 (not reachable, offset not updated in the result)
45+
// Y - 0 (not reachable, offset not updated in the result)
46+
// Z - N2
47+
// Note the difference in SLM offset for Z in the inlining/no-inlining cases for
48+
// Z scope. This is because inlining effectively splits Z scope when inlining
49+
// into kernel0 and kernel2
50+
51+
#include <sycl/ext/intel/esimd.hpp>
52+
#include <sycl/sycl.hpp>
53+
54+
#include <cstring>
55+
#include <iostream>
56+
57+
using namespace sycl;
58+
using namespace sycl::ext::intel::esimd;
59+
using namespace sycl::ext::intel::experimental::esimd;
60+
61+
using T = uint32_t;
62+
63+
constexpr int SLM_N1 = 7;
64+
constexpr int SLM_N2 = 1;
65+
constexpr int SLM_X = 8;
66+
constexpr int SLM_Y = 16;
67+
constexpr int SLM_Z = 4;
68+
69+
constexpr int LOCAL_SIZE = 2;
70+
constexpr int GLOBAL_SIZE = 2;
71+
72+
template <class T> void scalar_store(T *base, uint32_t off, T val) {
73+
scatter<T, 1>(base, simd<uint32_t, 1>(off * sizeof(T)), val);
74+
}
75+
76+
// Result array format
77+
// |---- kernel0 ----| |---- kernel2 ----|
78+
// x_off, y_off, z_off, x_off, y_off, z_off
79+
80+
// Offsets in the result sub-array, to store each checked SLM frame offset at.
81+
enum { x_off_ind, y_off_ind, z_off_ind, num_offs };
82+
83+
// Offsets of sub-arrays
84+
enum { kernel0_base = x_off_ind, kernel1_base = num_offs };
85+
86+
#define STORE_SLM_OFF(ID, off) \
87+
if (local_id == 0) { \
88+
scalar_store(out, base + ID##_off_ind, off); \
89+
}
90+
91+
#ifdef FORCE_INLINE
92+
constexpr bool force_inline = true;
93+
#define INLINE_CTL inline __attribute__((always_inline))
94+
#else
95+
constexpr bool force_inline = false;
96+
#define INLINE_CTL __attribute__((noinline))
97+
#endif // FORCE_INLINE
98+
99+
INLINE_CTL void bar(int local_id, T *out, unsigned base) {
100+
slm_allocator<SLM_Z> a;
101+
unsigned z_off = a.get_offset();
102+
STORE_SLM_OFF(z, z_off);
103+
}
104+
105+
INLINE_CTL void foo(int local_id, T *out, unsigned base) {
106+
slm_allocator<SLM_X> a;
107+
unsigned x_off = a.get_offset();
108+
STORE_SLM_OFF(x, x_off);
109+
bar(local_id, out, base);
110+
{
111+
slm_allocator<SLM_Y> b;
112+
unsigned y_off = b.get_offset();
113+
STORE_SLM_OFF(y, y_off);
114+
}
115+
}
116+
117+
int main(void) {
118+
queue q;
119+
auto dev = q.get_device();
120+
std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
121+
std::cout << "force_inline=" << force_inline << "\n";
122+
auto ctxt = q.get_context();
123+
124+
constexpr int num_kernels = 2;
125+
T *arr = malloc_shared<T>(num_kernels * num_offs, dev, ctxt);
126+
std::memset(arr, 0, num_kernels * num_offs * sizeof(T));
127+
128+
auto e = q.submit([&](handler &cgh) {
129+
cgh.parallel_for<class Kernel0>(nd_range<1>(GLOBAL_SIZE, LOCAL_SIZE),
130+
[=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
131+
slm_init(SLM_N1);
132+
int local_id = ndi.get_local_linear_id();
133+
foo(local_id, arr, kernel0_base);
134+
bar(local_id, arr, kernel0_base);
135+
});
136+
});
137+
e.wait();
138+
139+
e = q.submit([&](handler &cgh) {
140+
cgh.parallel_for<class Kernel2>(nd_range<1>(GLOBAL_SIZE, LOCAL_SIZE),
141+
[=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
142+
slm_init(SLM_N2);
143+
int local_id = ndi.get_local_linear_id();
144+
bar(local_id, arr, kernel1_base);
145+
});
146+
});
147+
e.wait();
148+
149+
T gold_arr[num_kernels * num_offs];
150+
T *gold_arr0 = &gold_arr[kernel0_base];
151+
T *gold_arr1 = &gold_arr[kernel1_base];
152+
153+
// For kernel0 inline/no-inline results are the same for X and Y:
154+
// X - N1
155+
// Y - N1 + X
156+
// Z - max(N1 + X, N2)
157+
gold_arr0[x_off_ind] = SLM_N1;
158+
gold_arr0[y_off_ind] = SLM_N1 + SLM_X;
159+
160+
// For kernel0 inline/no-inline case splits for Z:
161+
#ifdef FORCE_INLINE
162+
gold_arr0[z_off_ind] = SLM_N1;
163+
#else
164+
gold_arr0[z_off_ind] = std::max(SLM_N1 + SLM_X, SLM_N2);
165+
#endif // FORCE_INLINE
166+
167+
// For kernel1 inline/no-inline results are the same for X and Y:
168+
// X - 0
169+
// Y - 0
170+
gold_arr1[x_off_ind] = 0;
171+
gold_arr1[y_off_ind] = 0;
172+
173+
// For kernel1 inline/no-inline case splits for Z:
174+
#ifdef FORCE_INLINE
175+
gold_arr1[z_off_ind] = SLM_N2;
176+
#else
177+
gold_arr1[z_off_ind] = std::max(SLM_N1 + SLM_X, SLM_N2);
178+
#endif // FORCE_INLINE
179+
180+
T *test_arr = arr;
181+
int err_cnt = 0;
182+
183+
T kernel_bases[num_kernels] = {kernel0_base, kernel1_base};
184+
185+
for (int k = 0; k < num_kernels; k++) {
186+
std::cout << "Kernel " << k << "\n";
187+
188+
for (int i = 0; i < num_offs; i++) {
189+
T test = test_arr[kernel_bases[k] + i];
190+
T gold = gold_arr[kernel_bases[k] + i];
191+
192+
if (test != gold) {
193+
++err_cnt;
194+
std::cerr << " *** ERROR at [" << i << "]: " << test << " != " << gold
195+
<< "(gold)\n";
196+
} else {
197+
std::cout << " [" << i << "]: " << test << " == " << gold
198+
<< "(gold)\n";
199+
}
200+
}
201+
}
202+
std::cout << (err_cnt ? "FAILED\n" : "Passed\n");
203+
return err_cnt ? 1 : 0;
204+
}

0 commit comments

Comments
 (0)