This repository was archived by the owner on Mar 28, 2023. It is now read-only.

[ESIMD] E2E test for slm_allocator API. #1449

Merged · 8 commits · Jan 1, 2023
186 changes: 186 additions & 0 deletions SYCL/ESIMD/slm_alloc.cpp
@@ -0,0 +1,186 @@
// REQUIRES: gpu
// UNSUPPORTED: cuda || hip
//
// RUN: %clangxx -fsycl %s -o %t.1.out
// RUN: %GPU_RUN_PLACEHOLDER %t.1.out
//
// Vary the test case by forcing inlining of the functions with slm_allocator:
// RUN: %clangxx -fsycl -DFORCE_INLINE %s -o %t.2.out
// RUN: %GPU_RUN_PLACEHOLDER %t.2.out

// This is an end-to-end test for the slm_allocator API used together with
// slm_init. The call graph is:
//   Test1(kernel) - uses slm_init(SLM_IN_KERNEL)
//     /  \
//    /    v
//   /    bar - uses slm_allocator(SLM_IN_BAR)
//  v
// foo - uses slm_allocator(SLM_IN_FOO)
// Test1 kernel SLM usage is SLM_IN_KERNEL + max(SLM_IN_BAR, SLM_IN_FOO).
// The SLM offset returned by the slm_allocator in foo and bar is the same and
// is SLM_IN_KERNEL bytes.
// bar uses a slightly bigger SLM frame than foo. It adds BAR_MARKER1 (10) to
// each value foo stored in SLM, plus appends a couple of BAR_MARKER2 (100)
// elements.
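//
// With the concrete constants below (T = int, so ELEM_SIZE = 4, and
// LOCAL_SIZE = 4), the expected per-work-group SLM layout is:
//   bytes [0, 16)  - the kernel's own frame, data chunk "X" (4 ints)
//   bytes [16, 32) - the overlapping foo/bar frame, data chunk "Y" (4 ints)
//   bytes [32, 40) - bar's extra tail, data chunk "Z" (2 ints)
// i.e. SLM_TOTAL = 16 + max(16, 24) = 40 bytes.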

#include <iostream>
#include <sycl/ext/intel/esimd.hpp>
#include <sycl/sycl.hpp>

using namespace sycl;
using namespace sycl::ext::intel::esimd;
using namespace sycl::ext::intel::experimental::esimd;

using T = int;
constexpr int LOCAL_SIZE = 4;
constexpr int GLOBAL_SIZE = 8;

constexpr int NUM_WGS = GLOBAL_SIZE / LOCAL_SIZE;

constexpr int ELEM_SIZE = sizeof(T);

constexpr int SLM_IN_KERNEL = LOCAL_SIZE * ELEM_SIZE;
constexpr int SLM_IN_FOO = LOCAL_SIZE * ELEM_SIZE;
constexpr int BAR_EXTRA_ELEMS = 2;
constexpr int SLM_IN_BAR = SLM_IN_FOO + BAR_EXTRA_ELEMS * ELEM_SIZE;
constexpr int SLM_TOTAL = SLM_IN_KERNEL + std::max(SLM_IN_FOO, SLM_IN_BAR);
constexpr int BAR_MARKER1 = 10;
constexpr int BAR_MARKER2 = 100;

#ifdef FORCE_INLINE
constexpr bool force_inline = true;
inline
__attribute__((always_inline))
#else
constexpr bool force_inline = false;
__attribute__((noinline))
#endif // FORCE_INLINE
void
foo(int local_id) {
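// Scoped SLM allocation under test: reserves SLM_IN_FOO bytes on top of the
// kernel's slm_init region for the lifetime of 'a'.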
slm_allocator<SLM_IN_FOO> a;
uint32_t slm_off = a.get_offset();
// write data chunk "Y":
slm_scalar_store(slm_off + local_id * ELEM_SIZE, (T)local_id);
}

#ifdef FORCE_INLINE
inline
__attribute__((always_inline))
#else
__attribute__((noinline))
#endif // FORCE_INLINE
void
bar(int local_id) {
slm_allocator<SLM_IN_BAR> a;
uint32_t slm_off = a.get_offset();
uint32_t off = slm_off + local_id * ELEM_SIZE;
T v = slm_scalar_load<T>(off);
// update data chunk "Y":
slm_scalar_store(off, v + BAR_MARKER1);

if (local_id == 0) {
for (int i = 0; i < BAR_EXTRA_ELEMS; i++) {
// write data chunk "Z":
slm_scalar_store((2 * LOCAL_SIZE + i) * ELEM_SIZE, (T)BAR_MARKER2);
}
}
}

int main(void) {
queue q;
auto dev = q.get_device();
std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
std::cout << "force_inline=" << force_inline << "\n";
auto ctxt = q.get_context();
uint32_t size = SLM_TOTAL * NUM_WGS / ELEM_SIZE;

T *arr = malloc_shared<T>(size, dev, ctxt);

auto e = q.submit([&](handler &cgh) {
cgh.parallel_for<class Test1>(
nd_range<1>(GLOBAL_SIZE, LOCAL_SIZE),
[=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
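// Reserve the kernel's own SLM frame; slm_allocator frames created in
// foo/bar are laid out after these SLM_IN_KERNEL bytes.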
slm_init(SLM_IN_KERNEL);
int local_id = ndi.get_local_linear_id();
int group_id = ndi.get_group_linear_id();

// write data chunk "X":
slm_scalar_store(local_id * ELEM_SIZE, local_id);
barrier();

foo(local_id);
barrier();

bar(local_id);
barrier();

// copy data from SLM to the output for further verification
if (local_id == 0) {
uint32_t group_off = SLM_TOTAL * group_id;

for (int i = 0; i < SLM_TOTAL / ELEM_SIZE; i++) {
uint32_t slm_off = i * ELEM_SIZE;
uint32_t mem_off = group_off + slm_off;
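// A 1-element scatter acts as a scalar store to USM at byte offset mem_off.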
scatter(arr, simd<uint32_t, 1>(mem_off),
simd<T, 1>(slm_scalar_load<T>(slm_off)));
}
}
});
});
e.wait();

for (int i = 0; i < NUM_WGS * SLM_TOTAL / ELEM_SIZE; i++) {
std::cout << " " << arr[i];
if ((i + 1) % 10 == 0) {
std::cout << "\n";
}
}
std::cout << "\n";
int err_cnt = 0;

for (int g = 0; g < NUM_WGS; g++) {
uint32_t group_off = SLM_TOTAL * g / ELEM_SIZE;
for (int i = 0; i < LOCAL_SIZE; i++) {
int ind = group_off + i;

// check data copied from kernel's SLM frame ("X")
auto test = arr[ind];
auto gold = i;

if (test != gold) {
if (++err_cnt < 10) {
std::cerr << "*** ERROR (X) at " << ind << ": " << test
<< " != " << gold << " (gold)\n";
}
}
// check data copied from the overlapping part of foo's and bar's SLM
// frames - "Y"
ind = ind + LOCAL_SIZE; // shift to the foo/bar SLM frame
test = arr[ind];
gold = i + BAR_MARKER1;

if (test != gold) {
if (++err_cnt < 10) {
std::cerr << "*** ERROR (Y) at " << ind << ": " << test
<< " != " << gold << " (gold)\n";
}
}
}
// now check data written by bar past the overlapping part of foo/bar SLM
// frame - "Z"
for (int i = 0; i < BAR_EXTRA_ELEMS; i++) {
int ind =
group_off + 2 /*kernel's and foo's SLM frames*/ * LOCAL_SIZE + i;
auto test = arr[ind];
auto gold = BAR_MARKER2;

if (test != gold) {
if (++err_cnt < 10) {
std::cerr << "*** ERROR (Z) at " << ind << ": " << test
<< " != " << gold << " (gold)\n";
}
}
}
}
std::cout << (err_cnt ? "FAILED\n" : "Passed\n");
return err_cnt ? 1 : 0;
}
204 changes: 204 additions & 0 deletions SYCL/ESIMD/slm_alloc_many_kernels_many_funcs.cpp
@@ -0,0 +1,204 @@
// REQUIRES: gpu
// UNSUPPORTED: cuda || hip
//
// RUN: %clangxx -fsycl %s -o %t.1.out
// RUN: %GPU_RUN_PLACEHOLDER %t.1.out
//
// Vary the test case by forcing inlining of the functions with slm_allocator:
// RUN: %clangxx -fsycl -DFORCE_INLINE %s -o %t.2.out
// RUN: %GPU_RUN_PLACEHOLDER %t.2.out

// Checks the validity of SLM frame offsets in the case of a complex call
// graph with two kernels and two functions, all using SLM, where one of the
// functions uses two slm_allocator objects with nested liveness ranges.

// Hierarchy of SLM frames:
//   N1      N2
//   | \      |
//   |  \     |
//   v   \    |
//   X    \   |
//   | \   \  |
//   |  \   \ |
//   v   \---v
//   Y       Z
//
// SLM offsets are expected to be:
// 1) no inlining case:
// --- Kernel0
//   X - N1
//   Y - N1 + X
//   Z - max(N1 + X, N2)
// --- Kernel2
//   X - 0 (not reachable, offset not updated in the result)
//   Y - 0 (not reachable, offset not updated in the result)
//   Z - max(N1 + X, N2)
// 2) forced inlining case:
// --- Kernel0
//   X - N1
//   Y - N1 + X
//   Z - N1 // because Z (bar) is inlined both into X (foo) and directly into
//          // Kernel0; the directly-inlined copy executes last, so its
//          // allocation (at offset N1) is what gets recorded in the result.
// --- Kernel2
//   X - 0 (not reachable, offset not updated in the result)
//   Y - 0 (not reachable, offset not updated in the result)
//   Z - N2
// Note the difference in the SLM offset of Z between the inlining and
// no-inlining cases. This is because inlining effectively splits Z's scope
// when bar is inlined into Kernel0 and Kernel2.
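//
// With the concrete sizes below (N1 = 7, N2 = 1, X = 8, Y = 16, Z = 4), the
// no-inlining case should thus record X = 7, Y = 7 + 8 = 15 and
// Z = max(7 + 8, 1) = 15 for Kernel0, and only Z = 15 for Kernel2; with
// forced inlining, Z becomes 7 for Kernel0 and 1 for Kernel2.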

#include <sycl/ext/intel/esimd.hpp>
#include <sycl/sycl.hpp>

#include <cstring>
#include <iostream>

using namespace sycl;
using namespace sycl::ext::intel::esimd;
using namespace sycl::ext::intel::experimental::esimd;

using T = uint32_t;

constexpr int SLM_N1 = 7;
constexpr int SLM_N2 = 1;
constexpr int SLM_X = 8;
constexpr int SLM_Y = 16;
constexpr int SLM_Z = 4;

constexpr int LOCAL_SIZE = 2;
constexpr int GLOBAL_SIZE = 2;

// Stores one scalar to USM via a 1-element scatter; scatter takes byte
// offsets, hence the off * sizeof(T) scaling.
template <class T> void scalar_store(T *base, uint32_t off, T val) {
scatter<T, 1>(base, simd<uint32_t, 1>(off * sizeof(T)), val);
}

// Result array format
// |---- kernel0 ----| |---- kernel2 ----|
// x_off, y_off, z_off, x_off, y_off, z_off

// Offsets in the result sub-array, to store each checked SLM frame offset at.
enum { x_off_ind, y_off_ind, z_off_ind, num_offs };

// Offsets of sub-arrays
enum { kernel0_base = x_off_ind, kernel1_base = num_offs };

#define STORE_SLM_OFF(ID, off) \
if (local_id == 0) { \
scalar_store(out, base + ID##_off_ind, off); \
}
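// For example, STORE_SLM_OFF(x, x_off) expands to:
//   if (local_id == 0) { scalar_store(out, base + x_off_ind, x_off); }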

#ifdef FORCE_INLINE
constexpr bool force_inline = true;
#define INLINE_CTL inline __attribute__((always_inline))
#else
constexpr bool force_inline = false;
#define INLINE_CTL __attribute__((noinline))
#endif // FORCE_INLINE

INLINE_CTL void bar(int local_id, T *out, unsigned base) {
slm_allocator<SLM_Z> a;
unsigned z_off = a.get_offset();
STORE_SLM_OFF(z, z_off);
}

INLINE_CTL void foo(int local_id, T *out, unsigned base) {
slm_allocator<SLM_X> a;
unsigned x_off = a.get_offset();
STORE_SLM_OFF(x, x_off);
bar(local_id, out, base);
{
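// Nested liveness range: Y's frame is allocated while X's frame is still
// live, so its expected offset is N1 + X.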
slm_allocator<SLM_Y> b;
unsigned y_off = b.get_offset();
STORE_SLM_OFF(y, y_off);
}
}

int main(void) {
queue q;
auto dev = q.get_device();
std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
std::cout << "force_inline=" << force_inline << "\n";
auto ctxt = q.get_context();

constexpr int num_kernels = 2;
T *arr = malloc_shared<T>(num_kernels * num_offs, dev, ctxt);
std::memset(arr, 0, num_kernels * num_offs * sizeof(T));

auto e = q.submit([&](handler &cgh) {
cgh.parallel_for<class Kernel0>(nd_range<1>(GLOBAL_SIZE, LOCAL_SIZE),
[=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
slm_init(SLM_N1);
int local_id = ndi.get_local_linear_id();
foo(local_id, arr, kernel0_base);
bar(local_id, arr, kernel0_base);
});
});
e.wait();

e = q.submit([&](handler &cgh) {
cgh.parallel_for<class Kernel2>(nd_range<1>(GLOBAL_SIZE, LOCAL_SIZE),
[=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
slm_init(SLM_N2);
int local_id = ndi.get_local_linear_id();
bar(local_id, arr, kernel1_base);
});
});
e.wait();

T gold_arr[num_kernels * num_offs];
T *gold_arr0 = &gold_arr[kernel0_base];
T *gold_arr1 = &gold_arr[kernel1_base];

// For Kernel0, the inline and no-inline results are the same for X and Y:
//   X - N1
//   Y - N1 + X
gold_arr0[x_off_ind] = SLM_N1;
gold_arr0[y_off_ind] = SLM_N1 + SLM_X;

// For Kernel0, the inline and no-inline cases differ for Z:
#ifdef FORCE_INLINE
gold_arr0[z_off_ind] = SLM_N1;
#else
gold_arr0[z_off_ind] = std::max(SLM_N1 + SLM_X, SLM_N2);
#endif // FORCE_INLINE

// For the second kernel (Kernel2, recorded at kernel1_base), the inline and
// no-inline results are the same for X and Y:
//   X - 0
//   Y - 0
gold_arr1[x_off_ind] = 0;
gold_arr1[y_off_ind] = 0;

// For Kernel2, the inline and no-inline cases differ for Z:
#ifdef FORCE_INLINE
gold_arr1[z_off_ind] = SLM_N2;
#else
gold_arr1[z_off_ind] = std::max(SLM_N1 + SLM_X, SLM_N2);
#endif // FORCE_INLINE

T *test_arr = arr;
int err_cnt = 0;

T kernel_bases[num_kernels] = {kernel0_base, kernel1_base};

for (int k = 0; k < num_kernels; k++) {
std::cout << "Kernel " << k << "\n";

for (int i = 0; i < num_offs; i++) {
T test = test_arr[kernel_bases[k] + i];
T gold = gold_arr[kernel_bases[k] + i];

if (test != gold) {
++err_cnt;
std::cerr << " *** ERROR at [" << i << "]: " << test << " != " << gold
<< " (gold)\n";
} else {
std::cout << " [" << i << "]: " << test << " == " << gold
<< " (gold)\n";
}
}
}
std::cout << (err_cnt ? "FAILED\n" : "Passed\n");
return err_cnt ? 1 : 0;
}