
Commit 32e8731
[SYCL] Add 'regression' tests to SYCL/InvokeSimd (#1561)
1 parent 1e338c0

18 files changed: +1826 −0 lines
Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+// TODO: enable on Windows once driver is ready
+// REQUIRES: gpu && linux
+// UNSUPPORTED: cuda || hip
+//
+// Check that full compilation works:
+// RUN: %clangxx -DIMPL_SUBGROUP -fsycl -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr %S/../call_vadd_1d_loop.cpp -o %t.out
+// RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %GPU_RUN_PLACEHOLDER %t.out
+//
+// VISALTO enable run
+// RUN: env IGC_VISALTO=63 IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %GPU_RUN_PLACEHOLDER %t.out
+
+/*
+ * This test is the same as InvokeSimd/regression/call_vadd_1d_loop.cpp, but
+ * compiles without the optional subgroup attribute and is intended to check
+ * that the compiler is able to choose the subgroup size correctly.
+ */
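
Note: the -DIMPL_SUBGROUP flag in the RUN lines above (and in the variants below) drives the subgroup-attribute toggle in the shared test sources. A minimal sketch of that pattern, using the SUBGROUP_ATTR and VL names from the full test source included later in this diff:

    constexpr int VL = 16; // subgroup width used throughout these tests

    #ifdef IMPL_SUBGROUP
    // Attribute omitted: the compiler chooses the subgroup size itself.
    #define SUBGROUP_ATTR
    #else
    // Attribute present: the kernel requires a subgroup size of exactly VL.
    #define SUBGROUP_ATTR [[intel::reqd_sub_group_size(VL)]]
    #endif

The kernel lambda is then declared as [=](nd_item<1> ndi) SUBGROUP_ATTR { ... }, so the same source builds both the pinned-size variant and the compiler-chosen variant exercised here.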
Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+// TODO: enable on Windows once driver is ready
+// REQUIRES: gpu && linux
+// UNSUPPORTED: cuda || hip
+//
+// TODO: enable when the Jira ticket is resolved
+// XFAIL: gpu-intel-pvc
+//
+// Check that full compilation works:
+// RUN: %clangxx -DIMPL_SUBGROUP -fsycl -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr %S/../call_vadd_1d_loop_naive.cpp -o %t.out
+// RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %GPU_RUN_PLACEHOLDER %t.out
+//
+// VISALTO enable run
+// RUN: env IGC_VISALTO=63 IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %GPU_RUN_PLACEHOLDER %t.out
+
+/*
+ * This test is the same as InvokeSimd/regression/call_vadd_1d_loop_naive.cpp,
+ * but compiles without the optional subgroup attribute and is intended to
+ * check that the compiler is able to choose the subgroup size correctly.
+ */
Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+// TODO: enable on Windows once driver is ready
+// REQUIRES: gpu && linux
+// UNSUPPORTED: cuda || hip
+//
+// Check that full compilation works:
+// RUN: %clangxx -DIMPL_SUBGROUP -fsycl -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr %S/../call_vadd_1d_spill.cpp -o %t.out
+// RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %GPU_RUN_PLACEHOLDER %t.out
+//
+// VISALTO enable run
+// RUN: env IGC_VISALTO=63 IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %GPU_RUN_PLACEHOLDER %t.out
+
+/*
+ * This test is the same as InvokeSimd/regression/call_vadd_1d_spill.cpp, but
+ * compiles without the optional subgroup attribute and is intended to check
+ * that the compiler is able to choose the subgroup size correctly.
+ */
Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+// TODO: enable on Windows once driver is ready
+// REQUIRES: gpu && linux
+// UNSUPPORTED: cuda || hip
+//
+// TODO: enable when the Jira ticket is resolved
+// XFAIL: gpu
+//
+// Check that full compilation works:
+// RUN: %clangxx -DIMPL_SUBGROUP -fsycl -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr -g %S/../debug_symbols.cpp -o %t.out
+// RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %GPU_RUN_PLACEHOLDER %t.out
+//
+// VISALTO enable run
+// RUN: env IGC_VISALTO=63 IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %GPU_RUN_PLACEHOLDER %t.out
+
+/*
+ * This test is the same as InvokeSimd/regression/debug_symbols.cpp, but
+ * compiles without the optional subgroup attribute and is intended to check
+ * that the compiler is able to choose the subgroup size correctly.
+ */
Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+// TODO: enable on Windows once driver is ready
+// REQUIRES: gpu && linux
+// UNSUPPORTED: cuda || hip
+// REQUIRES: gpu-intel-dg1 || gpu-intel-dg2 || gpu-intel-pvc
+//
+// Check that full compilation works:
+// RUN: %clangxx -DIMPL_SUBGROUP -fsycl -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr %S/../dp4a.cpp -o %t.out
+// RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %GPU_RUN_PLACEHOLDER %t.out
+//
+// VISALTO enable run
+// RUN: env IGC_VISALTO=63 IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %GPU_RUN_PLACEHOLDER %t.out
+
+/*
+ * This test is the same as InvokeSimd/regression/dp4a.cpp, but compiles
+ * without the optional subgroup attribute and is intended to check that the
+ * compiler is able to choose the subgroup size correctly.
+ */
Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+// TODO: enable on Windows once driver is ready
+// REQUIRES: gpu && linux
+// UNSUPPORTED: cuda || hip
+//
+// Check that full compilation works:
+// RUN: %clangxx -DIMPL_SUBGROUP -fsycl -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr %S/../matrix_add.cpp -o %t.out
+// RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %GPU_RUN_PLACEHOLDER %t.out
+//
+// VISALTO enable run
+// RUN: env IGC_VISALTO=63 IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %GPU_RUN_PLACEHOLDER %t.out
+
+/*
+ * This test is the same as InvokeSimd/regression/matrix_add.cpp, but compiles
+ * without the optional subgroup attribute and is intended to check that the
+ * compiler is able to choose the subgroup size correctly.
+ */
Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+// TODO: enable on Windows once driver is ready
+// REQUIRES: gpu && linux
+// UNSUPPORTED: cuda || hip
+//
+// Check that full compilation works:
+// RUN: %clangxx -DIMPL_SUBGROUP -fsycl -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr %S/../matrix_multiply_USM.cpp -o %t.out
+// RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %GPU_RUN_PLACEHOLDER %t.out
+//
+// VISALTO enable run
+// RUN: env IGC_VISALTO=63 IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %GPU_RUN_PLACEHOLDER %t.out
+
+/*
+ * This test is the same as InvokeSimd/regression/matrix_multiply_USM.cpp, but
+ * compiles without the optional subgroup attribute and is intended to check
+ * that the compiler is able to choose the subgroup size correctly.
+ */
Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+// TODO: enable on Windows once driver is ready
+// REQUIRES: gpu && linux
+// UNSUPPORTED: cuda || hip
+//
+// Check that full compilation works:
+// RUN: %clangxx -DIMPL_SUBGROUP -fsycl -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr %S/../matrix_multiply_accessor_get_pointer.cpp -o %t.out
+// RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %GPU_RUN_PLACEHOLDER %t.out
+//
+// VISALTO enable run
+// RUN: env IGC_VISALTO=63 IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %GPU_RUN_PLACEHOLDER %t.out
+
+/*
+ * This test is the same as
+ * InvokeSimd/regression/matrix_multiply_accessor_get_pointer.cpp, but
+ * compiles without the optional subgroup attribute and is intended to check
+ * that the compiler is able to choose the subgroup size correctly.
+ */
Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+// TODO: enable on Windows once driver is ready
+// REQUIRES: gpu && linux
+// UNSUPPORTED: cuda || hip
+//
+// Check that full compilation works:
+// RUN: %clangxx -DIMPL_SUBGROUP -fsycl -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr %S/../tiled_matrix_multiplication.cpp -o %t.out
+// RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %GPU_RUN_PLACEHOLDER %t.out
+//
+// VISALTO enable run
+// RUN: env IGC_VISALTO=63 IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %GPU_RUN_PLACEHOLDER %t.out
+
+/*
+ * This test is the same as
+ * InvokeSimd/regression/tiled_matrix_multiplication.cpp, but compiles
+ * without the optional subgroup attribute and is intended to check that the
+ * compiler is able to choose the subgroup size correctly.
+ */
Lines changed: 193 additions & 0 deletions
@@ -0,0 +1,193 @@
+// TODO: enable on Windows once driver is ready
+// REQUIRES: gpu && linux
+// UNSUPPORTED: cuda || hip
+//
+// TODO: enable when the Jira ticket is resolved
+// XFAIL: gpu-intel-pvc
+//
+// Check that full compilation works:
+// RUN: %clangxx -fsycl -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr %s -o %t.out
+// RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %GPU_RUN_PLACEHOLDER %t.out
+//
+// VISALTO enable run
+// RUN: env IGC_VISALTO=63 IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %GPU_RUN_PLACEHOLDER %t.out
+
+// Tests invoke_simd support in the compiler/headers.
+
+/* This program tests loop functionality. The key parts of this program
+ * are:
+ * 1) The global execution range is divided/reduced by VL.
+ * 2) Each work-item must do VL times more work.
+ * 3) invoke_simd functionality is integrated into this test case
+ * by accumulating wi_ids into simds instead of vector elements.
+ * There is thus an extra looping step that must occur in order to
+ * perform the vector additions: each simd of wi_ids must be
+ * looped through and each wi_id extracted. Then, each wi_id
+ * is used to compute an absolute offset into the underlying vectors
+ * A, B, and C. That is, each ESIMD function call actually performs
+ * VL vector additions.
+ *
+ * This test also runs with all types of VISA link-time optimizations enabled.
+ */
+
+#include <sycl/ext/intel/esimd.hpp>
+#include <sycl/ext/oneapi/experimental/invoke_simd.hpp>
+#include <sycl/sycl.hpp>
+
+#include <functional>
+#include <iostream>
+#include <type_traits>
+
+/* The subgroup size attribute is optional.
+ * If it is absent, the compiler decides what subgroup size to use.
+ */
+#ifdef IMPL_SUBGROUP
+#define SUBGROUP_ATTR
+#else
+#define SUBGROUP_ATTR [[intel::reqd_sub_group_size(VL)]]
+#endif
+
+using namespace sycl::ext::oneapi::experimental;
+namespace esimd = sycl::ext::intel::esimd;
+constexpr int VL = 16;
+
+__attribute__((always_inline)) esimd::simd<float, VL>
+ESIMD_CALLEE(float *A, float *B, float *C,
+             esimd::simd<int, VL> indices) SYCL_ESIMD_FUNCTION {
+  // Loop through the indices:
+  // Extract each index and use it to load 16 underlying vector elements (from
+  // A and B), perform the vector addition of these elements, and store the
+  // result in the correct location in C.
+  for (int i = 0; i < VL; ++i) {
+    int index = indices[i];
+    int absolute_index = index * VL;
+    esimd::simd<float, VL> a, b;
+    a.copy_from(A + absolute_index);
+    b.copy_from(B + absolute_index);
+    esimd::simd<float, VL> c = a + b;
+    c.copy_to(C + absolute_index);
+  }
+
+  // The current implementation requires us to return something here; so we
+  // return a dummy, which is simply ignored by the caller.
+  return esimd::simd<float, VL>();
+}
+
+[[intel::device_indirectly_callable]] SYCL_EXTERNAL
+simd<float, VL> __regcall SIMD_CALLEE(float *A, float *B, float *C,
+                                      simd<int, VL> indices)
+    SYCL_ESIMD_FUNCTION;
+
+void SPMD_doVadd(float va[VL], float vb[VL], float vc[VL]) {
+  for (int i = 0; i < VL; i++) {
+    vc[i] = va[i] + vb[i];
+  }
+}
+
+using namespace sycl;
+
+constexpr bool use_invoke_simd = true;
+
+int main(void) {
+  constexpr unsigned Size = 1024;
+  constexpr unsigned GroupSize = 4 * VL;
+
+  auto q = queue{gpu_selector_v};
+  auto dev = q.get_device();
+  std::cout << "Running on " << dev.get_info<sycl::info::device::name>()
+            << "\n";
+  auto ctxt = q.get_context();
+
+  float *A =
+      static_cast<float *>(malloc_shared(Size * sizeof(float), dev, ctxt));
+  float *B =
+      static_cast<float *>(malloc_shared(Size * sizeof(float), dev, ctxt));
+  float *C =
+      static_cast<float *>(malloc_shared(Size * sizeof(float), dev, ctxt));
+
+  for (unsigned i = 0; i < Size; ++i) {
+    A[i] = B[i] = i;
+    C[i] = -1;
+  }
+
+  sycl::range<1> GlobalRange{Size / VL};
+  // Number of work-items in each work-group.
+  sycl::range<1> LocalRange{GroupSize};
+
+  sycl::nd_range<1> Range(GlobalRange, LocalRange);
+
+  try {
+    auto e = q.submit([&](handler &cgh) {
+      cgh.parallel_for<class Test>(Range, [=](nd_item<1> ndi) SUBGROUP_ATTR {
+        sub_group sg = ndi.get_sub_group();
+        group<1> g = ndi.get_group();
+        uint32_t i =
+            sg.get_group_linear_id() * VL + g.get_group_linear_id() * GroupSize;
+        int wi_id = i + sg.get_local_id();
+
+        if constexpr (use_invoke_simd) {
+          float res = invoke_simd(sg, SIMD_CALLEE, uniform{A}, uniform{B},
+                                  uniform{C}, wi_id);
+        } else {
+          // NOTE: This kernel will be instantiated for each index in the
+          // GlobalRange, which is currently 1024 / 16 = 64. However, in
+          // this loop implementation, we do not want to do an SPMD-style
+          // vector addition on each individual work-item in the global
+          // execution range; rather, we want to simulate a SIMD-style
+          // vector addition using standard arrays of VL elements, but only
+          // processing every VLth index.
+          unsigned int offset = ndi.get_global_id(0) * VL;
+          float va[VL], vb[VL], vc[VL];
+          // Load input vectors A and B.
+          for (int k = 0; k < VL; k++) {
+            va[k] = A[offset + k];
+            vb[k] = B[offset + k];
+          }
+
+          SPMD_doVadd(va, vb, vc);
+          for (int k = 0; k < VL; k++) {
+            C[offset + k] = vc[k];
+          }
+        }
+      });
+    });
+    e.wait();
+  } catch (sycl::exception const &e) {
+    sycl::free(A, q);
+    sycl::free(B, q);
+    sycl::free(C, q);
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    return e.code().value();
+  }
+
+  int err_cnt = 0;
+
+  for (unsigned i = 0; i < Size; ++i) {
+    if (A[i] + B[i] != C[i]) {
+      if (++err_cnt < 10) {
+        std::cout << "failed at index " << i << ", " << C[i] << " != " << A[i]
+                  << " + " << B[i] << "\n";
+      }
+    }
+  }
+  if (err_cnt > 0) {
+    std::cout << "  pass rate: "
+              << ((float)(Size - err_cnt) / (float)Size) * 100.0f << "% ("
+              << (Size - err_cnt) << "/" << Size << ")\n";
+  }
+
+  sycl::free(A, q);
+  sycl::free(B, q);
+  sycl::free(C, q);
+
+  std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n");
+  return err_cnt > 0 ? 1 : 0;
+}
+
+[[intel::device_indirectly_callable]] SYCL_EXTERNAL
+simd<float, VL> __regcall SIMD_CALLEE(float *A, float *B, float *C,
+                                      simd<int, VL> indices)
+    SYCL_ESIMD_FUNCTION {
+  esimd::simd<float, VL> res = ESIMD_CALLEE(A, B, C, indices);
+  return res;
+}
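
As a sanity check on the design described in the file's header comment: with Size = 1024 and VL = 16, GlobalRange has Size / VL = 64 work-items, i.e. four subgroups of VL (assuming the subgroup size is VL, as the reqd_sub_group_size variant requires); each invoke_simd call hands VL wi_ids to ESIMD_CALLEE, which performs one VL-wide add at offset wi_id * VL per id, so the four calls cover all 1024 elements exactly once. A minimal host-only sketch (hypothetical, not part of this commit) that verifies that arithmetic:

    #include <cassert>
    #include <vector>

    // Hypothetical host-only check of the test's index arithmetic: every
    // element of A/B/C should be written exactly once across all calls.
    int main() {
      constexpr int VL = 16, Size = 1024;        // as in the test above
      std::vector<int> touched(Size, 0);
      const int num_wi = Size / VL;              // 64 work-items in GlobalRange
      for (int sg = 0; sg < num_wi / VL; ++sg)   // one invoke_simd per subgroup
        for (int lane = 0; lane < VL; ++lane) {  // each lane contributes a wi_id
          int wi_id = sg * VL + lane;
          int absolute_index = wi_id * VL;       // as computed in ESIMD_CALLEE
          for (int k = 0; k < VL; ++k)
            ++touched[absolute_index + k];       // one lane of a VL-wide add
        }
      for (int v : touched)
        assert(v == 1);                          // each element written once
      return 0;
    }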
