Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit c8d3a3a

Browse files
[SYCL] Add ESIMD tests with SYCL 2020 spec constants (#291)
1 parent 46fe69c commit c8d3a3a

File tree

3 files changed

+285
-0
lines changed

3 files changed

+285
-0
lines changed
Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
// TODO enable on Windows
2+
// REQUIRES: linux && gpu
3+
// UNSUPPORTED: cuda
4+
// RUN: %clangxx -fsycl %s -o %t.out
5+
// RUN: %GPU_RUN_PLACEHOLDER %t.out 16
6+
7+
#include "esimd_test_utils.hpp"
8+
9+
#include <CL/sycl.hpp>
10+
#include <CL/sycl/INTEL/esimd.hpp>
11+
#include <iostream>
12+
13+
static constexpr int NUM_BINS = 256;
14+
static constexpr int SLM_SIZE = (NUM_BINS * 4);
15+
static constexpr int BLOCK_WIDTH = 32;
16+
static constexpr int NUM_BLOCKS = 32;
17+
18+
using namespace cl::sycl;
19+
using namespace sycl::ext::intel::experimental::esimd;
20+
21+
constexpr specialization_id<unsigned int> NumBlocksSpecId(NUM_BLOCKS);
22+
23+
// Histogram kernel: computes the distribution of pixel intensities
24+
ESIMD_INLINE void histogram_atomic(const uint32_t *input_ptr, uint32_t *output,
25+
uint32_t gid, uint32_t lid,
26+
uint32_t local_size, uint32_t num_blocks) {
27+
// Declare and initialize SLM
28+
slm_init(SLM_SIZE);
29+
uint linear_id = gid * local_size + lid;
30+
31+
simd<uint, 16> slm_offset(0, 1);
32+
slm_offset += 16 * lid;
33+
slm_offset *= sizeof(int);
34+
simd<uint, 16> slm_data = 0;
35+
slm_store<uint, 16>(slm_data, slm_offset);
36+
esimd_barrier();
37+
38+
// Each thread handles NUM_BLOCKSxBLOCK_WIDTH pixel blocks
39+
auto start_off = (linear_id * BLOCK_WIDTH * num_blocks);
40+
for (int y = 0; y < num_blocks; y++) {
41+
auto start_addr = ((unsigned int *)input_ptr) + start_off;
42+
auto data = block_load<uint, 32>(start_addr);
43+
auto in = data.format<uchar>();
44+
45+
#pragma unroll
46+
for (int j = 0; j < BLOCK_WIDTH * sizeof(int); j += 16) {
47+
// Accumulate local histogram for each pixel value
48+
simd<uint, 16> dataOffset = in.select<16, 1>(j).read();
49+
dataOffset *= sizeof(int);
50+
slm_atomic<EsimdAtomicOpType::ATOMIC_INC, uint, 16>(dataOffset, 1);
51+
}
52+
start_off += BLOCK_WIDTH;
53+
}
54+
esimd_barrier();
55+
56+
// Update global sum by atomically adding each local histogram
57+
simd<uint, 16> local_histogram;
58+
local_histogram = slm_load<uint32_t, 16>(slm_offset);
59+
flat_atomic<EsimdAtomicOpType::ATOMIC_ADD, uint32_t, 8>(
60+
output, slm_offset.select<8, 1>(0), local_histogram.select<8, 1>(0), 1);
61+
flat_atomic<EsimdAtomicOpType::ATOMIC_ADD, uint32_t, 8>(
62+
output, slm_offset.select<8, 1>(8), local_histogram.select<8, 1>(8), 1);
63+
}
64+
65+
// This function calculates histogram of the image with the CPU.
66+
// @param size: the size of the input array.
67+
// @param src: pointer to the input array.
68+
// @param cpu_histogram: pointer to the histogram of the input image.
69+
void HistogramCPU(unsigned int size, unsigned int *src,
70+
unsigned int *cpu_histogram) {
71+
for (int i = 0; i < size; i++) {
72+
unsigned int x = src[i];
73+
cpu_histogram[(x)&0xFFU] += 1;
74+
cpu_histogram[(x >> 8) & 0xFFU] += 1;
75+
cpu_histogram[(x >> 16) & 0xFFU] += 1;
76+
cpu_histogram[(x >> 24) & 0xFFU] += 1;
77+
}
78+
}
79+
80+
// This function compares the output data calculated by the CPU and the
81+
// GPU separately.
82+
// If they are identical, return 1, else return 0.
83+
int CheckHistogram(unsigned int *cpu_histogram, unsigned int *gpu_histogram) {
84+
unsigned int bad = 0;
85+
for (int i = 0; i < NUM_BINS; i++) {
86+
if (cpu_histogram[i] != gpu_histogram[i]) {
87+
std::cout << "At " << i << ": CPU = " << cpu_histogram[i]
88+
<< ", GPU = " << gpu_histogram[i] << std::endl;
89+
if (bad >= 256)
90+
return 0;
91+
bad++;
92+
}
93+
}
94+
if (bad > 0)
95+
return 0;
96+
97+
return 1;
98+
}
99+
100+
class NumBlocksConst;
101+
class histogram_slm;
102+
103+
int main(int argc, char **argv) {
104+
queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler());
105+
auto dev = q.get_device();
106+
auto ctxt = q.get_context();
107+
108+
const char *input_file = nullptr;
109+
unsigned int width = 1024 * sizeof(unsigned int);
110+
unsigned int height = 1024;
111+
112+
// Initializes input.
113+
unsigned int input_size = width * height;
114+
unsigned int *input_ptr =
115+
(unsigned int *)malloc_shared(input_size, dev, ctxt);
116+
printf("Processing %dx%d inputs\n", (int)(width / sizeof(unsigned int)),
117+
height);
118+
119+
srand(2009);
120+
input_size = input_size / sizeof(int);
121+
for (int i = 0; i < input_size; ++i) {
122+
input_ptr[i] = rand() % 256;
123+
input_ptr[i] |= (rand() % 256) << 8;
124+
input_ptr[i] |= (rand() % 256) << 16;
125+
input_ptr[i] |= (rand() % 256) << 24;
126+
}
127+
128+
// Allocates system memory for output buffer.
129+
int buffer_size = sizeof(unsigned int) * NUM_BINS;
130+
unsigned int *hist = new unsigned int[buffer_size];
131+
if (hist == nullptr) {
132+
std::cerr << "Out of memory\n";
133+
exit(1);
134+
}
135+
memset(hist, 0, buffer_size);
136+
137+
// Uses the CPU to calculate the histogram output data.
138+
unsigned int cpu_histogram[NUM_BINS];
139+
memset(cpu_histogram, 0, sizeof(cpu_histogram));
140+
141+
HistogramCPU(input_size, input_ptr, cpu_histogram);
142+
143+
std::cout << "finish cpu_histogram\n";
144+
145+
// Uses the GPU to calculate the histogram output data.
146+
unsigned int *output_surface =
147+
(uint32_t *)malloc_shared(4 * NUM_BINS, dev, ctxt);
148+
memset(output_surface, 0, 4 * NUM_BINS);
149+
150+
unsigned int num_blocks{NUM_BLOCKS};
151+
if (argc == 2) {
152+
num_blocks = atoi(argv[1]);
153+
std::cout << "new num_blocks = " << num_blocks << "\n";
154+
}
155+
156+
unsigned int num_threads;
157+
num_threads = width * height / (num_blocks * BLOCK_WIDTH * sizeof(int));
158+
159+
auto GlobalRange = cl::sycl::range<1>(num_threads);
160+
auto LocalRange = cl::sycl::range<1>(NUM_BINS / 16);
161+
cl::sycl::nd_range<1> Range(GlobalRange, LocalRange);
162+
163+
try {
164+
auto e = q.submit([&](cl::sycl::handler &cgh) {
165+
cgh.set_specialization_constant<NumBlocksSpecId>(num_blocks);
166+
cgh.parallel_for<histogram_slm>(
167+
Range,
168+
[=](cl::sycl::nd_item<1> ndi, kernel_handler kh) SYCL_ESIMD_KERNEL {
169+
histogram_atomic(input_ptr, output_surface, ndi.get_group(0),
170+
ndi.get_local_id(0), 16,
171+
kh.get_specialization_constant<NumBlocksSpecId>());
172+
});
173+
});
174+
e.wait();
175+
} catch (cl::sycl::exception const &e) {
176+
std::cout << "SYCL exception caught: " << e.what() << '\n';
177+
return e.get_cl_code();
178+
}
179+
180+
std::cout << "finish GPU histogram\n";
181+
182+
memcpy(hist, output_surface, 4 * NUM_BINS);
183+
184+
free(output_surface, ctxt);
185+
186+
free(input_ptr, ctxt);
187+
188+
// Compares the CPU histogram output data with the
189+
// GPU histogram output data.
190+
// If there is no difference, the result is correct.
191+
// Otherwise there is something wrong.
192+
int res = CheckHistogram(cpu_histogram, hist);
193+
if (res)
194+
std::cout << "PASSED\n";
195+
else
196+
std::cout << "FAILED\n";
197+
198+
return res ? 0 : -1;
199+
}
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
// The test checks that ESIMD kernels support SYCL 2020 specialization constants
2+
// for all basic types, particularly a specialization constant can be redifined
3+
// and correct new value is used after redefinition.
4+
5+
#include "esimd_test_utils.hpp"
6+
7+
#include <CL/sycl.hpp>
8+
#include <CL/sycl/INTEL/esimd.hpp>
9+
10+
#include <iostream>
11+
#include <vector>
12+
13+
using namespace cl::sycl;
14+
15+
template <typename AccessorTy>
16+
ESIMD_INLINE void do_store(AccessorTy acc, int i, spec_const_t val) {
17+
using namespace sycl::ext::intel::experimental::esimd;
18+
// scatter function, that is used in scalar_store, can only process types
19+
// whose size is no more than 4 bytes.
20+
#if (STORE == 0)
21+
// bool
22+
scalar_store(acc, i, val ? 1 : 0);
23+
#elif (STORE == 1)
24+
// block
25+
block_store(acc, i, simd<spec_const_t, 2>{val});
26+
#else
27+
static_assert(STORE == 2, "Unspecified store");
28+
// scalar
29+
scalar_store(acc, i, val);
30+
#endif
31+
}
32+
33+
class TestKernel;
34+
35+
constexpr specialization_id<spec_const_t> ConstID(DEF_VAL);
36+
37+
int main(int argc, char **argv) {
38+
queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler());
39+
40+
auto dev = q.get_device();
41+
std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
42+
43+
std::vector<container_t> etalon = {DEF_VAL, REDEF_VAL};
44+
const size_t n_times = etalon.size();
45+
std::vector<container_t> output(n_times);
46+
47+
bool passed = true;
48+
for (int i = 0; i < n_times; i++) {
49+
try {
50+
sycl::buffer<container_t, 1> buf(output.data(), output.size());
51+
52+
q.submit([&](sycl::handler &cgh) {
53+
auto acc = buf.get_access<sycl::access::mode::write>(cgh);
54+
if (i % 2 != 0)
55+
cgh.set_specialization_constant<ConstID>(REDEF_VAL);
56+
cgh.single_task<TestKernel>([=](kernel_handler kh) SYCL_ESIMD_KERNEL {
57+
do_store(acc, i, kh.get_specialization_constant<ConstID>());
58+
});
59+
});
60+
} catch (cl::sycl::exception const &e) {
61+
std::cout << "SYCL exception caught: " << e.what() << '\n';
62+
return e.get_cl_code();
63+
}
64+
65+
if (output[i] != etalon[i]) {
66+
passed = false;
67+
std::cout << "comparison error -- case #" << i << " -- ";
68+
std::cout << "output: " << output[i] << ", ";
69+
std::cout << "etalon: " << etalon[i] << std::endl;
70+
}
71+
}
72+
73+
if (passed) {
74+
std::cout << "passed" << std::endl;
75+
return 0;
76+
}
77+
78+
std::cout << "FAILED" << std::endl;
79+
return 1;
80+
}

SYCL/ESIMD/spec_const/spec_const_ushort.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
// type size.
1717
// RUN: %clangxx -fsycl -I%S/.. %s -o %t.out
1818
// RUN: %GPU_RUN_PLACEHOLDER %t.out
19+
// RUN: %clangxx -fsycl -I%S/.. -DSYCL2020 %s -o %t.2020.out
20+
// RUN: %GPU_RUN_PLACEHOLDER %t.2020.out
1921
// UNSUPPORTED: cuda
2022

2123
#include <cstdint>
@@ -27,4 +29,8 @@
2729
using spec_const_t = uint16_t;
2830
using container_t = uint16_t;
2931

32+
#ifndef SYCL2020
3033
#include "Inputs/spec_const_common.hpp"
34+
#else
35+
#include "Inputs/spec-const-2020-common.hpp"
36+
#endif

0 commit comments

Comments
 (0)