This repository was archived by the owner on Mar 28, 2023. It is now read-only.
forked from llvm/llvm-test-suite
-
Notifications
You must be signed in to change notification settings - Fork 130
[ESIMD] E2E test for slm_allocator API. #1449
Merged
Merged
Changes from all commits
Commits
Show all changes
8 commits
Select commit
Hold shift + click to select a range
dcc8ec1
[ESIMD] E2E test for slm_allocator API.
kbobrovs fdea25f
Apply suggestions from code review
kbobrovs cfb2dcb
update to new slm_allocator interface with SLM amount as template arg…
kbobrovs 5a19d0c
Merge branch 'dyn_slm_alloc' of https://github.com/kbobrovs/llvm-test…
kbobrovs 4033106
clang-format
kbobrovs 9e0b2b2
add force_inline mode display
kbobrovs d3e6f56
add more test cases
kbobrovs d5ce3a6
Update SYCL/ESIMD/slm_alloc.cpp
kbobrovs File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,186 @@ | ||
// REQUIRES: gpu | ||
// UNSUPPORTED: cuda || hip | ||
// | ||
// RUN: %clangxx -fsycl %s -o %t.1.out | ||
// RUN: %GPU_RUN_PLACEHOLDER %t.1.out | ||
// | ||
// Vary the test case by forcing inlining of the functions with slm_allocator: | ||
// RUN: %clangxx -fsycl -DFORCE_INLINE %s -o %t.2.out | ||
// RUN: %GPU_RUN_PLACEHOLDER %t.2.out | ||
|
||
// This is an end-to-end test for the slm_allocator API used together with the | ||
// slm_init. The call graph is: | ||
// Test1(kernel) - uses slm_init(SLM_IN_KERNEL) | ||
// / \ | ||
// / v | ||
// / bar - uses slm_allocator(SLM_IN_BAR) | ||
// v | ||
// foo - uses slm_allocator(SLM_IN_FOO) | ||
// Test1 kernel SLM usage is SLM_IN_KERNEL + max(SLM_IN_BAR, SLM_IN_FOO). | ||
// SLM offset returned by the slm_allocator in foo and bar is the same and is | ||
// SLM_IN_KERNEL bytes. | ||
// Bar uses slightly bigger SLM frame than foo. It modifies values (adds 10) in | ||
// SLM resulting from foo, plus appends couple more '100's. | ||
|
||
#include <iostream> | ||
#include <sycl/ext/intel/esimd.hpp> | ||
#include <sycl/sycl.hpp> | ||
|
||
using namespace sycl; | ||
using namespace sycl::ext::intel::esimd; | ||
using namespace sycl::ext::intel::experimental::esimd; | ||
|
||
using T = int; | ||
constexpr int LOCAL_SIZE = 4; | ||
constexpr int GLOBAL_SIZE = 8; | ||
|
||
constexpr int NUM_WGS = GLOBAL_SIZE / LOCAL_SIZE; | ||
|
||
constexpr int ELEM_SIZE = sizeof(T); | ||
|
||
constexpr int SLM_IN_KERNEL = LOCAL_SIZE * ELEM_SIZE; | ||
constexpr int SLM_IN_FOO = LOCAL_SIZE * ELEM_SIZE; | ||
constexpr int BAR_EXTRA_ELEMS = 2; | ||
constexpr int SLM_IN_BAR = SLM_IN_FOO + BAR_EXTRA_ELEMS * ELEM_SIZE; | ||
constexpr int SLM_TOTAL = SLM_IN_KERNEL + std::max(SLM_IN_FOO, SLM_IN_BAR); | ||
constexpr int BAR_MARKER1 = 10; | ||
constexpr int BAR_MARKER2 = 100; | ||
|
||
#ifdef FORCE_INLINE | ||
constexpr bool force_inline = true; | ||
inline | ||
kbobrovs marked this conversation as resolved.
Show resolved
Hide resolved
|
||
__attribute__((always_inline)) | ||
#else | ||
constexpr bool force_inline = false; | ||
__attribute__((noinline)) | ||
#endif // FORCE_INLINE | ||
void | ||
foo(int local_id) { | ||
slm_allocator<SLM_IN_FOO> a; | ||
uint32_t slm_off = a.get_offset(); | ||
// write data chunk "Y": | ||
slm_scalar_store(slm_off + local_id * ELEM_SIZE, (T)local_id); | ||
} | ||
|
||
#ifdef FORCE_INLINE | ||
inline | ||
kbobrovs marked this conversation as resolved.
Show resolved
Hide resolved
|
||
__attribute__((always_inline)) | ||
#else | ||
__attribute__((noinline)) | ||
#endif // FORCE_INLINE | ||
void | ||
bar(int local_id) { | ||
slm_allocator<SLM_IN_BAR> a; | ||
uint32_t slm_off = a.get_offset(); | ||
uint32_t off = slm_off + local_id * ELEM_SIZE; | ||
T v = slm_scalar_load<T>(off); | ||
// update data chunk "Y": | ||
slm_scalar_store(off, v + BAR_MARKER1); | ||
|
||
if (local_id == 0) { | ||
for (int i = 0; i < BAR_EXTRA_ELEMS; i++) { | ||
// write data chunk "Z": | ||
slm_scalar_store((2 * LOCAL_SIZE + i) * ELEM_SIZE, (T)BAR_MARKER2); | ||
} | ||
} | ||
} | ||
|
||
int main(void) { | ||
queue q; | ||
auto dev = q.get_device(); | ||
std::cout << "Running on " << dev.get_info<info::device::name>() << "\n"; | ||
std::cout << "force_inline=" << force_inline << "\n"; | ||
auto ctxt = q.get_context(); | ||
uint32_t size = SLM_TOTAL * NUM_WGS / ELEM_SIZE; | ||
|
||
T *arr = malloc_shared<T>(size, dev, ctxt); | ||
|
||
auto e = q.submit([&](handler &cgh) { | ||
cgh.parallel_for<class Test1>( | ||
nd_range<1>(GLOBAL_SIZE, LOCAL_SIZE), | ||
[=](nd_item<1> ndi) SYCL_ESIMD_KERNEL { | ||
slm_init(SLM_IN_KERNEL); | ||
int local_id = ndi.get_local_linear_id(); | ||
int group_id = ndi.get_group_linear_id(); | ||
|
||
// write data chunk "X": | ||
slm_scalar_store(local_id * ELEM_SIZE, local_id); | ||
barrier(); | ||
|
||
foo(local_id); | ||
barrier(); | ||
|
||
bar(local_id); | ||
barrier(); | ||
|
||
// copy data from SLM to the output for further verification | ||
if (local_id == 0) { | ||
uint32_t group_off = SLM_TOTAL * group_id; | ||
|
||
for (int i = 0; i < SLM_TOTAL / ELEM_SIZE; i++) { | ||
uint32_t slm_off = i * ELEM_SIZE; | ||
uint32_t mem_off = group_off + slm_off; | ||
scatter(arr, simd<uint32_t, 1>(mem_off), | ||
simd<T, 1>(slm_scalar_load<T>(slm_off))); | ||
} | ||
} | ||
}); | ||
}); | ||
e.wait(); | ||
|
||
for (int i = 0; i < NUM_WGS * SLM_TOTAL / ELEM_SIZE; i++) { | ||
std::cout << " " << arr[i]; | ||
if ((i + 1) % 10 == 0) { | ||
std::cout << "\n"; | ||
} | ||
} | ||
std::cout << "\n"; | ||
int err_cnt = 0; | ||
|
||
for (int g = 0; g < NUM_WGS; g++) { | ||
uint32_t group_off = SLM_TOTAL * g / ELEM_SIZE; | ||
for (int i = 0; i < LOCAL_SIZE; i++) { | ||
int ind = group_off + i; | ||
|
||
// check data copied from kernel's SLM frame ("X") | ||
auto test = arr[ind]; | ||
auto gold = i; | ||
|
||
if (test != gold) { | ||
if (++err_cnt < 10) { | ||
std::cerr << "*** ERROR (X) at " << ind << ": " << test | ||
<< " != " << gold << " (gold)\n"; | ||
} | ||
} | ||
// check data copied from the overlapping part of foo's and bar's SLM | ||
// frames - "Y" | ||
ind = ind + LOCAL_SIZE; // shift to the foo/bar SLM frame | ||
test = arr[ind]; | ||
gold = i + BAR_MARKER1; | ||
|
||
if (test != gold) { | ||
if (++err_cnt < 10) { | ||
std::cerr << "*** ERROR (Y) at " << ind << ": " << test | ||
<< " != " << gold << " (gold)\n"; | ||
} | ||
} | ||
} | ||
// now check data written by bar past the overlapping part of foo/bar SLM | ||
// frame - "Z" | ||
for (int i = 0; i < BAR_EXTRA_ELEMS; i++) { | ||
int ind = | ||
group_off + 2 /*kernel's and foo's SLM frames*/ * LOCAL_SIZE + i; | ||
auto test = arr[ind]; | ||
auto gold = BAR_MARKER2; | ||
|
||
if (test != gold) { | ||
if (++err_cnt < 10) { | ||
std::cerr << "*** ERROR (Z) at " << ind << ": " << test | ||
<< " != " << gold << " (gold)\n"; | ||
} | ||
} | ||
} | ||
} | ||
std::cout << (err_cnt ? "FAILED\n" : "Passed\n"); | ||
return err_cnt ? 1 : 0; | ||
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,204 @@ | ||
// REQUIRES: gpu | ||
// UNSUPPORTED: cuda || hip | ||
// | ||
// RUN: %clangxx -fsycl %s -o %t.1.out | ||
// RUN: %GPU_RUN_PLACEHOLDER %t.1.out | ||
// | ||
// Vary the test case by forcing inlining of the functions with slm_allocator: | ||
// RUN: %clangxx -fsycl -DFORCE_INLINE %s -o %t.2.out | ||
// RUN: %GPU_RUN_PLACEHOLDER %t.2.out | ||
|
||
// Checks validity of SLM frame offsets in case of a complex call graph with | ||
// two kernels and two functions all using SLM, one of the functions using two | ||
// slm_allocator objects with nested liveness ranges. | ||
|
||
// Hierarchy of SLM frames: | ||
// N1 N2 | ||
// | \ | | ||
// | \ | | ||
// v \ | | ||
// X \ | | ||
// | \ \ | | ||
// | \ \| | ||
// v \---v | ||
// Y Z | ||
// | ||
// SLM offsets are expected to be: | ||
// 1) no inlining case: | ||
// --- Kernel0 | ||
// X - N1 | ||
// Y - N1 + X | ||
// Z - max(N1 + X, N2) | ||
// --- Kernel2 | ||
// X - 0 (not reachable, offset not updated in the result) | ||
// Y - 0 (not reachable, offset not updated in the result) | ||
// Z - max(N1 + X, N2) | ||
// 2) forced inlining case: | ||
// --- Kernel0 | ||
// X - N1 | ||
// Y - N1 + X | ||
// Z - N1 // this is because Z (bar) is inlined into X (foo) and into | ||
// // N1 (kernel1), and execution of the second inlined scope | ||
// // allocation and offset recording into the result happens last. | ||
// --- Kernel2 | ||
// X - 0 (not reachable, offset not updated in the result) | ||
// Y - 0 (not reachable, offset not updated in the result) | ||
// Z - N2 | ||
// Note the difference in SLM offset for Z in the inlining/no-inlining cases for | ||
// Z scope. This is because inlining effectively splits Z scope when inlining | ||
// into kernel0 and kernel2 | ||
|
||
#include <sycl/ext/intel/esimd.hpp> | ||
#include <sycl/sycl.hpp> | ||
|
||
#include <cstring> | ||
#include <iostream> | ||
|
||
using namespace sycl; | ||
using namespace sycl::ext::intel::esimd; | ||
using namespace sycl::ext::intel::experimental::esimd; | ||
|
||
using T = uint32_t; | ||
|
||
constexpr int SLM_N1 = 7; | ||
constexpr int SLM_N2 = 1; | ||
constexpr int SLM_X = 8; | ||
constexpr int SLM_Y = 16; | ||
constexpr int SLM_Z = 4; | ||
|
||
constexpr int LOCAL_SIZE = 2; | ||
constexpr int GLOBAL_SIZE = 2; | ||
|
||
template <class T> void scalar_store(T *base, uint32_t off, T val) { | ||
scatter<T, 1>(base, simd<uint32_t, 1>(off * sizeof(T)), val); | ||
} | ||
|
||
// Result array format | ||
// |---- kernel0 ----| |---- kernel2 ----| | ||
// x_off, y_off, z_off, x_off, y_off, z_off | ||
|
||
// Offsets in the result sub-array, to store each checked SLM frame offset at. | ||
// Offsets in the result sub-array, to store each checked SLM frame offset at.
enum { x_off_ind, y_off_ind, z_off_ind, num_offs };

// Offsets of sub-arrays: kernel0's results start at index 0, kernel2's
// results start right after kernel0's (num_offs elements per kernel).
enum { kernel0_base = x_off_ind, kernel1_base = num_offs };

// Records SLM offset `off` for frame ID (x/y/z) into the per-kernel result
// sub-array; only work-item 0 writes, to avoid racy stores to the same slot.
// Relies on `local_id`, `out` and `base` being in scope at the expansion site.
#define STORE_SLM_OFF(ID, off)                                                 \
  if (local_id == 0) {                                                         \
    scalar_store(out, base + ID##_off_ind, off);                               \
  }

// INLINE_CTL forces or forbids inlining of the SLM-using functions so both
// call-graph shapes are covered by the two RUN lines.
#ifdef FORCE_INLINE
constexpr bool force_inline = true;
#define INLINE_CTL inline __attribute__((always_inline))
#else
constexpr bool force_inline = false;
#define INLINE_CTL __attribute__((noinline))
#endif // FORCE_INLINE
|
||
// Allocates the Z SLM frame and records its offset into the result array.
// Called both from foo (under kernel0) and directly from kernel2.
INLINE_CTL void bar(int local_id, T *out, unsigned base) {
  slm_allocator<SLM_Z> z_frame;
  unsigned off = z_frame.get_offset();
  STORE_SLM_OFF(z, off);
}
|
||
// Allocates the X SLM frame, records its offset, calls bar (Z frame), then
// opens a nested scope holding the Y frame to exercise nested liveness ranges.
INLINE_CTL void foo(int local_id, T *out, unsigned base) {
  slm_allocator<SLM_X> x_frame;
  unsigned x_off = x_frame.get_offset();
  STORE_SLM_OFF(x, x_off);
  bar(local_id, out, base);
  {
    // Y's allocator lives only inside this scope, stacked on top of X.
    slm_allocator<SLM_Y> y_frame;
    unsigned y_off = y_frame.get_offset();
    STORE_SLM_OFF(y, y_off);
  }
}
|
||
int main(void) { | ||
queue q; | ||
auto dev = q.get_device(); | ||
std::cout << "Running on " << dev.get_info<info::device::name>() << "\n"; | ||
std::cout << "force_inline=" << force_inline << "\n"; | ||
auto ctxt = q.get_context(); | ||
|
||
constexpr int num_kernels = 2; | ||
T *arr = malloc_shared<T>(num_kernels * num_offs, dev, ctxt); | ||
std::memset(arr, 0, num_kernels * num_offs * sizeof(T)); | ||
|
||
auto e = q.submit([&](handler &cgh) { | ||
cgh.parallel_for<class Kernel0>(nd_range<1>(GLOBAL_SIZE, LOCAL_SIZE), | ||
[=](nd_item<1> ndi) SYCL_ESIMD_KERNEL { | ||
slm_init(SLM_N1); | ||
int local_id = ndi.get_local_linear_id(); | ||
foo(local_id, arr, kernel0_base); | ||
bar(local_id, arr, kernel0_base); | ||
}); | ||
}); | ||
e.wait(); | ||
|
||
e = q.submit([&](handler &cgh) { | ||
cgh.parallel_for<class Kernel2>(nd_range<1>(GLOBAL_SIZE, LOCAL_SIZE), | ||
[=](nd_item<1> ndi) SYCL_ESIMD_KERNEL { | ||
slm_init(SLM_N2); | ||
int local_id = ndi.get_local_linear_id(); | ||
bar(local_id, arr, kernel1_base); | ||
}); | ||
}); | ||
e.wait(); | ||
|
||
T gold_arr[num_kernels * num_offs]; | ||
T *gold_arr0 = &gold_arr[kernel0_base]; | ||
T *gold_arr1 = &gold_arr[kernel1_base]; | ||
|
||
// For kernel0 inline/no-inline results are the same for X and Y: | ||
// X - N1 | ||
// Y - N1 + X | ||
// Z - max(N1 + X, N2) | ||
gold_arr0[x_off_ind] = SLM_N1; | ||
gold_arr0[y_off_ind] = SLM_N1 + SLM_X; | ||
|
||
// For kernel0 inline/no-inline case splits for Z: | ||
#ifdef FORCE_INLINE | ||
gold_arr0[z_off_ind] = SLM_N1; | ||
#else | ||
gold_arr0[z_off_ind] = std::max(SLM_N1 + SLM_X, SLM_N2); | ||
#endif // FORCE_INLINE | ||
|
||
// For kernel1 inline/no-inline results are the same for X and Y: | ||
// X - 0 | ||
// Y - 0 | ||
gold_arr1[x_off_ind] = 0; | ||
gold_arr1[y_off_ind] = 0; | ||
|
||
// For kernel1 inline/no-inline case splits for Z: | ||
#ifdef FORCE_INLINE | ||
gold_arr1[z_off_ind] = SLM_N2; | ||
#else | ||
gold_arr1[z_off_ind] = std::max(SLM_N1 + SLM_X, SLM_N2); | ||
#endif // FORCE_INLINE | ||
|
||
T *test_arr = arr; | ||
int err_cnt = 0; | ||
|
||
T kernel_bases[num_kernels] = {kernel0_base, kernel1_base}; | ||
|
||
for (int k = 0; k < num_kernels; k++) { | ||
std::cout << "Kernel " << k << "\n"; | ||
|
||
for (int i = 0; i < num_offs; i++) { | ||
T test = test_arr[kernel_bases[k] + i]; | ||
T gold = gold_arr[kernel_bases[k] + i]; | ||
|
||
if (test != gold) { | ||
++err_cnt; | ||
std::cerr << " *** ERROR at [" << i << "]: " << test << " != " << gold | ||
<< "(gold)\n"; | ||
} else { | ||
std::cout << " [" << i << "]: " << test << " == " << gold | ||
<< "(gold)\n"; | ||
} | ||
} | ||
} | ||
std::cout << (err_cnt ? "FAILED\n" : "Passed\n"); | ||
return err_cnt ? 1 : 0; | ||
} |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.