Skip to content

Commit 7f81052

Browse files
committed
[SYCL][ESIMD] Add on-device tests.
Add ESIMD tests which can run on Intel GPU devices. Authors: Gang Chen <[email protected]> Kaiyu Chen <[email protected]> Pratik Ashar <[email protected]> Konstantin S Bobrovsky <[email protected]> Signed-off-by: Konstantin S Bobrovsky <[email protected]>
1 parent fe7885c commit 7f81052

27 files changed

+5882
-0
lines changed

sycl/test/esimd/on-device/BitonicSortK.cpp

Lines changed: 718 additions & 0 deletions
Large diffs are not rendered by default.

sycl/test/esimd/on-device/BitonicSortKv2.cpp

Lines changed: 633 additions & 0 deletions
Large diffs are not rendered by default.
Binary file not shown.
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
// TODO enable on WIndows
2+
// REQUIRES: linux
3+
// REQUIRES: gpu
4+
// RUN: %clangxx-esimd -fsycl -D_CRT_SECURE_NO_WARNINGS=1 %s -o %t.out
5+
// RUN: %ESIMD_RUN_PLACEHOLDER %t.out
6+
7+
// This test checks that accessor-based memory accesses work correctly in ESIMD.
8+
9+
#include <CL/sycl.hpp>
10+
#include <CL/sycl/intel/esimd.hpp>
11+
12+
#include <iostream>
13+
14+
using namespace cl::sycl;
15+
16+
class ESIMDSelector : public device_selector {
17+
// Require GPU device unless HOST is requested in SYCL_DEVICE_TYPE env
18+
virtual int operator()(const device &device) const {
19+
if (const char *dev_type = getenv("SYCL_DEVICE_TYPE")) {
20+
if (!strcmp(dev_type, "GPU"))
21+
return device.is_gpu() ? 1000 : -1;
22+
if (!strcmp(dev_type, "HOST"))
23+
return device.is_host() ? 1000 : -1;
24+
std::cerr << "Supported 'SYCL_DEVICE_TYPE' env var values are 'GPU' and "
25+
"'HOST', '"
26+
<< dev_type << "' is not.\n";
27+
return -1;
28+
}
29+
// If "SYCL_DEVICE_TYPE" not defined, only allow gpu device
30+
return device.is_gpu() ? 1000 : -1;
31+
}
32+
};
33+
34+
auto exception_handler = [](exception_list l) {
35+
for (auto ep : l) {
36+
try {
37+
std::rethrow_exception(ep);
38+
} catch (cl::sycl::exception &e0) {
39+
std::cout << "sycl::exception: " << e0.what() << std::endl;
40+
} catch (std::exception &e) {
41+
std::cout << "std::exception: " << e.what() << std::endl;
42+
} catch (...) {
43+
std::cout << "generic exception\n";
44+
}
45+
}
46+
};
47+
48+
constexpr unsigned int VL = 1024 * 128;
49+
50+
using Ty = float;
51+
52+
int main() {
53+
Ty data0[VL] = {0};
54+
Ty data1[VL] = {0};
55+
constexpr Ty VAL = 5;
56+
57+
for (int i = 0; i < VL; i++) {
58+
data0[i] = i;
59+
}
60+
61+
try {
62+
queue q(ESIMDSelector{}, exception_handler);
63+
64+
buffer<Ty, 1> buf0(data0, range<1>(VL));
65+
buffer<Ty, 1> buf1(data1, range<1>(VL));
66+
67+
q.submit([&](handler &cgh) {
68+
std::cout << "Running on "
69+
<< q.get_device().get_info<cl::sycl::info::device::name>()
70+
<< "\n";
71+
72+
auto acc0 = buf0.get_access<access::mode::read_write>(cgh);
73+
auto acc1 = buf1.get_access<access::mode::write>(cgh);
74+
75+
cgh.parallel_for<class Test>(
76+
range<1>(1), [=](sycl::id<1> i) SYCL_ESIMD_KERNEL {
77+
using namespace sycl::intel::gpu;
78+
unsigned int offset = 0;
79+
for (int k = 0; k < VL / 16; k++) {
80+
simd<Ty, 16> var = block_load<Ty, 16>(acc0, offset);
81+
var += VAL;
82+
block_store(acc0, offset, var);
83+
block_store(acc1, offset, var + 1);
84+
offset += 64;
85+
}
86+
});
87+
});
88+
89+
q.wait();
90+
91+
} catch (cl::sycl::exception const &e) {
92+
std::cout << "SYCL exception caught: " << e.what() << '\n';
93+
return 2;
94+
}
95+
96+
int err_cnt = 0;
97+
98+
for (int i = 0; i < VL; i++) {
99+
Ty gold0 = i + VAL;
100+
Ty gold1 = gold0 + 1;
101+
Ty val0 = data0[i];
102+
Ty val1 = data1[i];
103+
104+
if (val0 != gold0) {
105+
if (++err_cnt < 10)
106+
std::cerr << "*** ERROR at data0[" << i << "]: " << val0
107+
<< " != " << gold0 << "(gold)\n";
108+
}
109+
if (val1 != gold1) {
110+
if (++err_cnt < 10)
111+
std::cerr << "*** ERROR at data1[" << i << "]: " << val1
112+
<< " != " << gold1 << "(gold)\n";
113+
}
114+
}
115+
if (err_cnt == 0) {
116+
std::cout << "Passed\n";
117+
return 0;
118+
} else {
119+
std::cout << "Failed: " << err_cnt << " of " << VL << " errors\n";
120+
return 1;
121+
}
122+
}
Lines changed: 248 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,248 @@
1+
// TODO enable on WIndows
2+
// REQUIRES: linux
3+
// REQUIRES: gpu
4+
// RUN: %clangxx-esimd -fsycl %s -o %t.out
5+
// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
6+
// RUN: %ESIMD_RUN_PLACEHOLDER %t.out
7+
8+
#include <CL/sycl.hpp>
9+
#include <CL/sycl/intel/esimd.hpp>
10+
#include <array>
11+
#include <iostream>
12+
13+
using namespace cl::sycl;
14+
15+
class ESIMDSelector : public device_selector {
16+
// Require GPU device unless HOST is requested in SYCL_DEVICE_TYPE env
17+
virtual int operator()(const device &device) const {
18+
if (const char *dev_type = getenv("SYCL_DEVICE_TYPE")) {
19+
if (!strcmp(dev_type, "GPU"))
20+
return device.is_gpu() ? 1000 : -1;
21+
if (!strcmp(dev_type, "HOST"))
22+
return device.is_host() ? 1000 : -1;
23+
std::cerr << "Supported 'SYCL_DEVICE_TYPE' env var values are 'GPU' and "
24+
"'HOST', '"
25+
<< dev_type << "' is not.\n";
26+
return -1;
27+
}
28+
// If "SYCL_DEVICE_TYPE" not defined, only allow gpu device
29+
return device.is_gpu() ? 1000 : -1;
30+
}
31+
};
32+
33+
auto exception_handler = [](exception_list l) {
34+
for (auto ep : l) {
35+
try {
36+
std::rethrow_exception(ep);
37+
} catch (cl::sycl::exception &e0) {
38+
std::cout << "sycl::exception: " << e0.what() << std::endl;
39+
} catch (std::exception &e) {
40+
std::cout << "std::exception: " << e.what() << std::endl;
41+
} catch (...) {
42+
std::cout << "generic exception\n";
43+
}
44+
}
45+
};
46+
47+
#define NUM_BINS 256
48+
#define IMG_WIDTH 1024
49+
#define IMG_HEIGHT 1024
50+
//
51+
// each parallel_for handles 64x32 bytes
52+
//
53+
#define BLOCK_WIDTH 32
54+
#define BLOCK_HEIGHT 64
55+
56+
void histogram_CPU(unsigned int width, unsigned int height, unsigned char *srcY,
57+
unsigned int *cpuHistogram) {
58+
int i;
59+
for (i = 0; i < width * height; i++) {
60+
cpuHistogram[srcY[i]] += 1;
61+
}
62+
}
63+
64+
void writeHist(unsigned int *hist) {
65+
int total = 0;
66+
67+
std::cerr << "\nHistogram: \n";
68+
for (int i = 0; i < NUM_BINS; i += 8) {
69+
std::cerr << "\n [" << i << " - " << i + 7 << "]:";
70+
for (int j = 0; j < 8; j++) {
71+
std::cerr << "\t" << hist[i + j];
72+
total += hist[i + j];
73+
}
74+
}
75+
std::cerr << "\nTotal = " << total << " \n";
76+
}
77+
78+
int checkHistogram(unsigned int *refHistogram, unsigned int *hist) {
79+
80+
for (int i = 0; i < NUM_BINS; i++) {
81+
if (refHistogram[i] != hist[i]) {
82+
return 0;
83+
}
84+
}
85+
return 1;
86+
}
87+
88+
int main(int argc, char *argv[]) {
89+
90+
const char *input_file = nullptr;
91+
unsigned int width = IMG_WIDTH * sizeof(unsigned int);
92+
unsigned int height = IMG_HEIGHT;
93+
94+
if (argc == 2) {
95+
input_file = argv[1];
96+
} else {
97+
std::cerr << "Usage: Histogram.exe input_file" << std::endl;
98+
std::cerr << "No input file specificed. Use default random value ...."
99+
<< std::endl;
100+
}
101+
102+
// ------------------------------------------------------------------------
103+
// Read in image luma plane
104+
105+
// Allocate Input Buffer
106+
queue q(ESIMDSelector{}, exception_handler);
107+
108+
auto dev = q.get_device();
109+
auto ctxt = q.get_context();
110+
unsigned char *srcY =
111+
static_cast<unsigned char *>(malloc_shared(width * height, dev, ctxt));
112+
unsigned int *bins = static_cast<unsigned int *>(
113+
malloc_shared(NUM_BINS * sizeof(unsigned int), dev, ctxt));
114+
std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
115+
116+
uint range_width = width / BLOCK_WIDTH;
117+
uint range_height = height / BLOCK_HEIGHT;
118+
119+
if (srcY == NULL) {
120+
std::cerr << "Out of memory\n";
121+
exit(1);
122+
}
123+
124+
// Initializes input.
125+
unsigned int input_size = width * height;
126+
std::cerr << "Processing inputs\n";
127+
128+
if (input_file != nullptr) {
129+
FILE *f = fopen(input_file, "rb");
130+
if (f == NULL) {
131+
std::cerr << "Error opening file " << input_file;
132+
std::exit(1);
133+
}
134+
135+
unsigned int cnt = fread(srcY, sizeof(unsigned char), input_size, f);
136+
if (cnt != input_size) {
137+
std::cerr << "Error reading input from " << input_file;
138+
std::exit(1);
139+
}
140+
} else {
141+
srand(2009);
142+
for (int i = 0; i < input_size; ++i) {
143+
srcY[i] = rand() % 256;
144+
}
145+
}
146+
147+
for (int i = 0; i < NUM_BINS; i++) {
148+
bins[i] = 0;
149+
}
150+
151+
// ------------------------------------------------------------------------
152+
// CPU Execution:
153+
154+
unsigned int cpuHistogram[NUM_BINS];
155+
memset(cpuHistogram, 0, sizeof(cpuHistogram));
156+
histogram_CPU(width, height, srcY, cpuHistogram);
157+
158+
cl::sycl::image<2> Img(srcY, image_channel_order::rgba,
159+
image_channel_type::unsigned_int32,
160+
range<2>{width / sizeof(uint4), height});
161+
162+
{
163+
// create ranges
164+
// We need that many workitems
165+
auto GlobalRange = range<1>(range_width * range_height);
166+
// Number of workitems in a workgroup
167+
auto LocalRange = range<1>(1);
168+
nd_range<1> Range(GlobalRange, LocalRange);
169+
170+
auto e = q.submit([&](handler &cgh) {
171+
auto readAcc = Img.get_access<uint4, cl::sycl::access::mode::read>(cgh);
172+
173+
cgh.parallel_for<class Hist>(
174+
Range, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
175+
using namespace sycl::intel::gpu;
176+
177+
// Get thread origin offsets
178+
uint tid = ndi.get_group(0);
179+
uint h_pos = (tid % range_width) * BLOCK_WIDTH;
180+
uint v_pos = (tid / range_width) * BLOCK_HEIGHT;
181+
182+
// Declare a 8x32 uchar matrix to store the input block pixel value
183+
simd<unsigned char, 8 * 32> in;
184+
185+
// Declare a vector to store the local histogram
186+
simd<unsigned int, NUM_BINS> histogram(0);
187+
188+
// Each thread handles BLOCK_HEIGHTxBLOCK_WIDTH pixel block
189+
for (int y = 0; y < BLOCK_HEIGHT / 8; y++) {
190+
// Perform 2D media block read to load 8x32 pixel block
191+
in =
192+
media_block_load<unsigned char, 8, 32>(readAcc, h_pos, v_pos);
193+
194+
// Accumulate local histogram for each pixel value
195+
#pragma unroll
196+
for (int i = 0; i < 8; i++) {
197+
#pragma unroll
198+
for (int j = 0; j < 32; j++) {
199+
histogram.select<1, 1>(in[i * 32 + j]) += 1;
200+
}
201+
}
202+
203+
// Update starting offset for the next work block
204+
v_pos += 8;
205+
}
206+
207+
// Declare a vector to store the offset for atomic write operation
208+
simd<unsigned int, 8> offset(0, 1); // init to 0, 1, 2, ..., 7
209+
offset *= sizeof(unsigned int);
210+
211+
// Update global sum by atomically adding each local histogram
212+
#pragma unroll
213+
for (int i = 0; i < NUM_BINS; i += 8) {
214+
// Declare a vector to store the source for atomic write operation
215+
simd<unsigned int, 8> src;
216+
src = histogram.select<8, 1>(i);
217+
218+
#ifdef __SYCL_DEVICE_ONLY__
219+
flat_atomic<EsimdAtomicOpType::ATOMIC_ADD, unsigned int, 8>(
220+
bins, offset, src, 1);
221+
offset += 8 * sizeof(unsigned int);
222+
#else
223+
auto vals = block_load<unsigned int, 8>(bins + i);
224+
vals = vals + src;
225+
block_store<unsigned int, 8>(bins + i, vals);
226+
#endif
227+
}
228+
});
229+
});
230+
e.wait();
231+
232+
// SYCL will enqueue and run the kernel. Recall that the buffer's data is
233+
// given back to the host at the end of scope.
234+
} // make sure data is given back to the host at the end of this scope
235+
236+
writeHist(bins);
237+
writeHist(cpuHistogram);
238+
// Checking Histogram
239+
if (checkHistogram(cpuHistogram, bins)) {
240+
std::cerr << "PASSED\n";
241+
return 0;
242+
} else {
243+
std::cerr << "FAILED\n";
244+
return 1;
245+
}
246+
247+
return 0;
248+
}

0 commit comments

Comments
 (0)