Skip to content

Commit 3bcf986

Browse files
authored
Merge pull request intel#427 from fveselov/nnvgse259-lsc_testroot-topic
Add more LSC load/store/prefetch tests
2 parents beba925 + 0dd8354 commit 3bcf986

File tree

62 files changed

+1469
-156
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

62 files changed

+1469
-156
lines changed
Lines changed: 265 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,265 @@
1+
#include <CL/sycl.hpp>
2+
#include <sycl/ext/intel/experimental/esimd.hpp>
3+
4+
#include <iostream>
5+
6+
#include "common.hpp"
7+
8+
using namespace cl::sycl;
9+
using namespace sycl::ext::intel::experimental::esimd;
10+
using namespace sycl::ext::intel::experimental::esimd::detail;
11+
12+
template <int case_num, typename T, uint32_t Groups, uint32_t Threads,
13+
int BlockWidth, int BlockHeight = 1, int NBlocks = 1,
14+
bool Transposed = false, bool Transformed = false,
15+
CacheHint L1H = CacheHint::None, CacheHint L3H = CacheHint::None,
16+
bool use_prefetch = false>
17+
bool test(unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch,
18+
int X, int Y) {
19+
// Some restrictions based on documentation
20+
static_assert(!(Transposed && Transformed),
21+
"Transposed and transformed is not supported");
22+
static_assert(BlockWidth > 0, "Block width must be positive");
23+
static_assert(BlockHeight > 0, "Block height must be positive");
24+
25+
if constexpr (Transposed) {
26+
static_assert(NBlocks == 1, "Transposed expected to be 1 block only");
27+
static_assert(sizeof(T) >= 4, "Transposed can only use D32 and D64");
28+
if constexpr (sizeof(T) == 4) {
29+
static_assert(BlockWidth <= 8,
30+
"D32 transposed allow only block width 8 and less");
31+
static_assert(BlockHeight <= 32,
32+
"D32 transposed allow only block height 32 and less");
33+
}
34+
if constexpr (sizeof(T) == 8) {
35+
static_assert(BlockWidth == 1 || BlockWidth == 2 || BlockWidth == 4,
36+
"D64 transposed allow only block width 1/2/4");
37+
static_assert(BlockHeight == 8,
38+
"D64 transposed allow only block height 8");
39+
}
40+
} else if constexpr (Transformed) {
41+
static_assert(sizeof(T) <= 2, "Transformed can only use D8 and D16");
42+
if constexpr (sizeof(T) == 2 && NBlocks == 4) {
43+
static_assert(BlockWidth <= 8,
44+
"Transformed D16x4 allow only block width 8 and less");
45+
}
46+
static_assert((sizeof(T) * BlockWidth) % 4 == 0,
47+
"Transformed block width must be aligned by DW");
48+
static_assert(BlockWidth <= 16,
49+
"Transformed block width must be 16 and less");
50+
static_assert(BlockWidth >= (4 / sizeof(T)),
51+
"Minimal transformed block width depends on data size");
52+
static_assert(BlockHeight <= 32,
53+
"Transformed block height must be 32 and less");
54+
static_assert(BlockHeight >= (4 / sizeof(T)),
55+
"Minimal transformed block height depends on data size");
56+
} else {
57+
static_assert((sizeof(T) * BlockWidth) % 4 == 0,
58+
"Block width must be aligned by DW");
59+
static_assert(sizeof(T) * BlockWidth * NBlocks <= 64,
60+
"Total block width must be 64B or less");
61+
static_assert(BlockHeight <= 32, "Block height must be 32 or less");
62+
if constexpr (sizeof(T) == 4) {
63+
static_assert(NBlocks < 4, "D32 restricted to use 1 or 2 blocks only");
64+
}
65+
if constexpr (sizeof(T) == 8) {
66+
static_assert(NBlocks < 2, "D64 restricted to use 1 block only");
67+
}
68+
}
69+
70+
constexpr int N =
71+
get_lsc_block_2d_data_size<T, NBlocks, BlockHeight, BlockWidth,
72+
Transposed, Transformed>();
73+
/* Due to store2d a is subject to stricter restrictions:
74+
* NBlocks always 1, no Transposed, no Transformed, max BlockHeight 8.
75+
* Series of 2d stores with height 1 are used to write loaded data to output
76+
* buffer. Also Transformed load2d extends BlockWidth to the next power of 2
77+
* and rounds up BlockHeight.
78+
*/
79+
constexpr int SH = Transformed
80+
? roundUpNextMultiple(BlockHeight, 4 / sizeof(T))
81+
: BlockHeight;
82+
constexpr int SW = Transformed ? getNextPowerOf2<BlockWidth>() : BlockWidth;
83+
constexpr int SN = get_lsc_block_2d_data_size<T, 1u, 1u, SW, false, false>();
84+
85+
std::cout << "N = " << N << std::endl;
86+
std::cout << "SN = " << SN << std::endl;
87+
std::cout << "W = " << BlockWidth << " SW = " << SW << std::endl;
88+
std::cout << "H = " << BlockHeight << " SH = " << SH << std::endl;
89+
90+
T old_val = get_rand<T>();
91+
92+
auto GPUSelector = gpu_selector{};
93+
auto q = queue{GPUSelector};
94+
auto dev = q.get_device();
95+
std::cout << "Running case #" << case_num << " on "
96+
<< dev.get_info<info::device::name>() << "\n";
97+
auto ctx = q.get_context();
98+
99+
// workgroups
100+
cl::sycl::range<1> GlobalRange{Groups};
101+
// threads in each group
102+
cl::sycl::range<1> LocalRange{Threads};
103+
cl::sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange};
104+
105+
unsigned SurfaceSize = SurfacePitch * SurfaceHeight * NBlocks;
106+
unsigned Size = SurfaceSize * Groups * Threads;
107+
108+
T *out = static_cast<T *>(sycl::malloc_shared(Size * sizeof(T), dev, ctx));
109+
for (int i = 0; i < Size; i++)
110+
out[i] = old_val;
111+
112+
T *in = static_cast<T *>(sycl::malloc_shared(Size * sizeof(T), dev, ctx));
113+
for (int i = 0; i < Size; i++)
114+
in[i] = get_rand<T>();
115+
116+
try {
117+
auto e = q.submit([&](handler &cgh) {
118+
cgh.parallel_for<KernelID<case_num>>(
119+
Range, [=](cl::sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL {
120+
uint16_t globalID = ndi.get_global_id(0);
121+
uint32_t off = globalID * SurfaceSize;
122+
123+
unsigned width = SurfaceWidth * sizeof(T) - 1;
124+
unsigned height = SurfaceHeight - 1;
125+
unsigned pitch = SurfacePitch * sizeof(T) - 1;
126+
127+
simd<T, N> vals;
128+
if constexpr (use_prefetch) {
129+
lsc_flat_prefetch2d<T, BlockWidth, BlockHeight, NBlocks,
130+
Transposed, Transformed, L1H, L3H>(
131+
in + off, width, height, pitch, X, Y);
132+
vals = lsc_flat_load2d<T, BlockWidth, BlockHeight, NBlocks,
133+
Transposed, Transformed>(
134+
in + off, width, height, pitch, X, Y);
135+
} else {
136+
vals = lsc_flat_load2d<T, BlockWidth, BlockHeight, NBlocks,
137+
Transposed, Transformed, L1H, L3H>(
138+
in + off, width, height, pitch, X, Y);
139+
}
140+
141+
for (int i = 0; i < NBlocks; i++) {
142+
for (int j = 0; j < SH; j++) {
143+
simd<T, SN> v =
144+
vals.template select<SN, 1>(i * SN * SH + j * SW);
145+
lsc_flat_store2d<T, SW>(
146+
out + off, SurfaceWidth * sizeof(T) - 1, SurfaceHeight - 1,
147+
SurfacePitch * sizeof(T) - 1, X + i * SW, Y + j, v);
148+
}
149+
}
150+
});
151+
});
152+
e.wait();
153+
} catch (cl::sycl::exception const &e) {
154+
std::cout << "SYCL exception caught: " << e.what() << '\n';
155+
sycl::free(out, ctx);
156+
sycl::free(in, ctx);
157+
return false;
158+
}
159+
160+
bool passed = true;
161+
162+
if constexpr (Transposed) {
163+
for (int gid = 0; gid < Groups * Threads; gid++) {
164+
int dx = 0, dy = 0;
165+
for (int j = 0; j < SurfaceHeight; j++) {
166+
for (int i = 0; i < SurfacePitch; i++) {
167+
T e = old_val;
168+
// index in linear buffer
169+
int idx = i + j * SurfacePitch + gid * SurfaceSize;
170+
171+
// check if inside block
172+
if ((i >= X) && (i < X + BlockWidth) && (j >= Y) &&
173+
(j < Y + BlockHeight)) {
174+
if (i < SurfaceWidth) {
175+
if (X + dx < SurfaceWidth)
176+
e = in[(X + dx) + (Y + dy) * SurfacePitch + gid * SurfaceSize];
177+
else
178+
e = (T)0;
179+
}
180+
dy += 1;
181+
if (dy == BlockHeight) {
182+
dy = 0;
183+
dx += 1;
184+
}
185+
}
186+
187+
if (out[idx] != e) {
188+
passed = false;
189+
std::cout << "out" << idx << "] = 0x" << std::hex
190+
<< (uint64_t)out[idx] << " vs etalon = 0x" << (uint64_t)e
191+
<< std::dec << std::endl;
192+
}
193+
}
194+
}
195+
}
196+
} else if constexpr (Transformed) {
197+
constexpr int scale = 4 / sizeof(T);
198+
for (int gid = 0; gid < Groups * Threads; gid++) {
199+
for (int j = 0; j < SurfaceHeight; j++) {
200+
for (int i = 0; i < SurfacePitch; i++) {
201+
T e = old_val;
202+
// index in linear buffer
203+
int idx = i + j * SurfacePitch + gid * SurfaceSize;
204+
205+
// check if inside block
206+
if ((i >= X) && (i < X + SW * NBlocks) && (j >= Y) && (j < Y + SH)) {
207+
int di = i - X;
208+
int dj = j - Y;
209+
int bn = di / SW;
210+
211+
int dx, dy;
212+
dx = di / scale + bn * (BlockWidth - SW / scale) +
213+
(dj % scale) * SW / scale;
214+
dy = dj + di % scale - dj % scale;
215+
216+
if (i < SurfaceWidth) {
217+
if (dx < BlockWidth * (bn + 1) && (dx + X) < SurfaceWidth &&
218+
(dy + Y) < SurfaceHeight)
219+
e = in[(X + dx) + (Y + dy) * SurfacePitch + gid * SurfaceSize];
220+
else
221+
e = (T)0;
222+
}
223+
}
224+
225+
if (out[idx] != e) {
226+
passed = false;
227+
std::cout << std::hex << "out[0x" << idx << "] = 0x"
228+
<< (uint64_t)out[idx] << " vs etalon = 0x" << (uint64_t)e
229+
<< std::dec << std::endl;
230+
}
231+
}
232+
}
233+
}
234+
} else {
235+
for (int gid = 0; gid < Groups * Threads; gid++) {
236+
for (int j = 0; j < SurfaceHeight; j++) {
237+
for (int i = 0; i < SurfacePitch; i++) {
238+
T e = old_val;
239+
// index in linear buffer
240+
int idx = i + j * SurfacePitch + gid * SurfaceSize;
241+
242+
// check if inside block
243+
if ((i >= X) && (i < X + BlockWidth * NBlocks) &&
244+
(i < SurfaceWidth) && (j >= Y) && (j < Y + BlockHeight))
245+
e = in[idx];
246+
247+
if (out[idx] != e) {
248+
passed = false;
249+
std::cout << "out[" << idx << "] = 0x" << std::hex
250+
<< (uint64_t)out[idx] << " vs etalon = 0x" << (uint64_t)e
251+
<< std::dec << std::endl;
252+
}
253+
}
254+
}
255+
}
256+
}
257+
258+
if (!passed)
259+
std::cout << "Case #" << case_num << " FAILED" << std::endl;
260+
261+
sycl::free(out, ctx);
262+
sycl::free(in, ctx);
263+
264+
return passed;
265+
}
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
#include <CL/sycl.hpp>
2+
#include <sycl/ext/intel/experimental/esimd.hpp>
3+
4+
#include <iostream>
5+
6+
#include "common.hpp"
7+
8+
using namespace cl::sycl;
9+
using namespace sycl::ext::intel::experimental::esimd;
10+
11+
template <int case_num, typename T, uint32_t Groups, uint32_t Threads,
12+
int BlockWidth, int BlockHeight = 1,
13+
int N = get_lsc_block_2d_data_size<T, 1u, BlockHeight, BlockWidth,
14+
false, false>(),
15+
CacheHint L1H = CacheHint::None, CacheHint L3H = CacheHint::None>
16+
bool test(unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch,
17+
int X, int Y) {
18+
static_assert(BlockWidth > 0, "Block width must be positive");
19+
static_assert(BlockHeight > 0, "Block height must be positive");
20+
static_assert((sizeof(T) * BlockWidth) % 4 == 0,
21+
"Block width must be aligned by DW");
22+
static_assert(sizeof(T) * BlockWidth <= 64,
23+
"Block width must be 64B or less");
24+
static_assert(BlockHeight <= 8, "Block height must be 8 or less");
25+
26+
T old_val = get_rand<T>();
27+
T new_val = get_rand<T>();
28+
29+
auto GPUSelector = gpu_selector{};
30+
auto q = queue{GPUSelector};
31+
auto dev = q.get_device();
32+
std::cout << "Running case #" << case_num << " on "
33+
<< dev.get_info<info::device::name>() << "\n";
34+
auto ctx = q.get_context();
35+
36+
// workgroups
37+
cl::sycl::range<1> GlobalRange{Groups};
38+
// threads in each group
39+
cl::sycl::range<1> LocalRange{Threads};
40+
cl::sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange};
41+
42+
unsigned SurfaceSize = SurfacePitch * SurfaceHeight;
43+
unsigned Size = SurfaceSize * Groups * Threads;
44+
45+
T *out = static_cast<T *>(sycl::malloc_shared(Size * sizeof(T), dev, ctx));
46+
for (int i = 0; i < Size; i++)
47+
out[i] = old_val;
48+
49+
try {
50+
auto e = q.submit([&](handler &cgh) {
51+
cgh.parallel_for<KernelID<case_num>>(
52+
Range, [=](cl::sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL {
53+
uint16_t globalID = ndi.get_global_id(0);
54+
uint32_t off = globalID * SurfaceSize;
55+
56+
simd<T, N> vals(new_val + off, 1);
57+
// IUT
58+
lsc_flat_store2d<T, BlockWidth, BlockHeight, false, false, L1H,
59+
L3H>(out + off, SurfaceWidth * sizeof(T) - 1,
60+
SurfaceHeight - 1,
61+
SurfacePitch * sizeof(T) - 1, X, Y, vals);
62+
});
63+
});
64+
e.wait();
65+
} catch (cl::sycl::exception const &e) {
66+
std::cout << "SYCL exception caught: " << e.what() << '\n';
67+
sycl::free(out, ctx);
68+
return false;
69+
}
70+
71+
bool passed = true;
72+
for (int gid = 0; gid < Groups * Threads; gid++) {
73+
T val = new_val + gid * SurfaceSize;
74+
75+
for (int j = 0; j < SurfaceHeight; j++) {
76+
for (int i = 0; i < SurfacePitch; i++) {
77+
T e = old_val;
78+
// check if inside block
79+
if ((i >= X) && (i < X + BlockWidth) && (i < SurfaceWidth) &&
80+
(j >= Y) && (j < Y + BlockHeight))
81+
e = val++;
82+
83+
// index in linear buffer
84+
int idx = i + j * SurfacePitch + gid * SurfaceSize;
85+
if (out[idx] != e) {
86+
passed = false;
87+
std::cout << "out[" << idx << "] = 0x" << std::hex
88+
<< (uint64_t)out[idx] << " vs etalon = 0x" << (uint64_t)e
89+
<< std::dec << std::endl;
90+
}
91+
}
92+
}
93+
}
94+
95+
if (!passed)
96+
std::cout << "Case #" << case_num << " FAILED" << std::endl;
97+
98+
sycl::free(out, ctx);
99+
100+
return passed;
101+
}

0 commit comments

Comments
 (0)