Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit 6d15120

Browse files
authored
[SYCL][ESIMD] Add ESIMD tests on constants specialization. (#35)
1 parent c97c9ad commit 6d15120

File tree

2 files changed

+396
-0
lines changed

2 files changed

+396
-0
lines changed

SYCL/ESIMD/histogram_256_slm_spec.cpp

Lines changed: 205 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,205 @@
1+
//==--------------- histogram_256_slm.cpp - DPC++ ESIMD on-device test ----==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
// TODO enable on Windows and Level Zero
9+
// REQUIRES: linux && gpu && opencl
10+
// RUN: %clangxx-esimd -fsycl %s -o %t.out
11+
// RUN: %ESIMD_RUN_PLACEHOLDER %t.out 16
12+
13+
#include "esimd_test_utils.hpp"
14+
15+
#include <CL/sycl.hpp>
16+
#include <CL/sycl/INTEL/esimd.hpp>
17+
#include <iostream>
18+
19+
static constexpr int NUM_BINS = 256;
20+
static constexpr int SLM_SIZE = (NUM_BINS * 4);
21+
static constexpr int BLOCK_WIDTH = 32;
22+
static constexpr int NUM_BLOCKS = 32;
23+
24+
using namespace cl::sycl;
25+
using namespace sycl::INTEL::gpu;
26+
27+
// Histogram kernel: computes the distribution of pixel intensities
28+
ESIMD_INLINE void histogram_atomic(const uint32_t *input_ptr, uint32_t *output,
29+
uint32_t gid, uint32_t lid,
30+
uint32_t local_size, uint32_t num_blocks) {
31+
// Declare and initialize SLM
32+
slm_init(SLM_SIZE);
33+
uint linear_id = gid * local_size + lid;
34+
35+
simd<uint, 16> slm_offset(0, 1);
36+
slm_offset += 16 * lid;
37+
slm_offset *= sizeof(int);
38+
simd<uint, 16> slm_data = 0;
39+
slm_store<uint, 16>(slm_data, slm_offset);
40+
esimd_fence(ESIMD_GLOBAL_COHERENT_FENCE);
41+
esimd_barrier();
42+
43+
// Each thread handles NUM_BLOCKSxBLOCK_WIDTH pixel blocks
44+
auto start_off = (linear_id * BLOCK_WIDTH * num_blocks);
45+
for (int y = 0; y < num_blocks; y++) {
46+
auto start_addr = ((unsigned int *)input_ptr) + start_off;
47+
auto data = block_load<uint, 32>(start_addr);
48+
auto in = data.format<uchar>();
49+
50+
#pragma unroll
51+
for (int j = 0; j < BLOCK_WIDTH * sizeof(int); j += 16) {
52+
// Accumulate local histogram for each pixel value
53+
auto dataOffset = convert<uint, uchar, 16>(in.select<16, 1>(j).read());
54+
dataOffset *= sizeof(int);
55+
slm_atomic<EsimdAtomicOpType::ATOMIC_INC, uint, 16>(dataOffset, 1);
56+
}
57+
start_off += BLOCK_WIDTH;
58+
}
59+
esimd_fence(ESIMD_GLOBAL_COHERENT_FENCE);
60+
esimd_barrier();
61+
62+
// Update global sum by atomically adding each local histogram
63+
simd<uint, 16> local_histogram;
64+
local_histogram = slm_load<uint32_t, 16>(slm_offset);
65+
flat_atomic<EsimdAtomicOpType::ATOMIC_ADD, uint32_t, 8>(
66+
output, slm_offset.select<8, 1>(0), local_histogram.select<8, 1>(0), 1);
67+
flat_atomic<EsimdAtomicOpType::ATOMIC_ADD, uint32_t, 8>(
68+
output, slm_offset.select<8, 1>(8), local_histogram.select<8, 1>(8), 1);
69+
}
70+
71+
// This function calculates histogram of the image with the CPU.
72+
// @param size: the size of the input array.
73+
// @param src: pointer to the input array.
74+
// @param cpu_histogram: pointer to the histogram of the input image.
75+
void HistogramCPU(unsigned int size, unsigned int *src,
76+
unsigned int *cpu_histogram) {
77+
for (int i = 0; i < size; i++) {
78+
unsigned int x = src[i];
79+
cpu_histogram[(x)&0xFFU] += 1;
80+
cpu_histogram[(x >> 8) & 0xFFU] += 1;
81+
cpu_histogram[(x >> 16) & 0xFFU] += 1;
82+
cpu_histogram[(x >> 24) & 0xFFU] += 1;
83+
}
84+
}
85+
86+
// This function compares the output data calculated by the CPU and the
87+
// GPU separately.
88+
// If they are identical, return 1, else return 0.
89+
int CheckHistogram(unsigned int *cpu_histogram, unsigned int *gpu_histogram) {
90+
unsigned int bad = 0;
91+
for (int i = 0; i < NUM_BINS; i++) {
92+
if (cpu_histogram[i] != gpu_histogram[i]) {
93+
std::cout << "At " << i << ": CPU = " << cpu_histogram[i]
94+
<< ", GPU = " << gpu_histogram[i] << std::endl;
95+
if (bad >= 256)
96+
return 0;
97+
bad++;
98+
}
99+
}
100+
if (bad > 0)
101+
return 0;
102+
103+
return 1;
104+
}
105+
106+
class NumBlocksConst;
107+
class histogram_slm;
108+
109+
int main(int argc, char **argv) {
110+
queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler());
111+
auto dev = q.get_device();
112+
auto ctxt = q.get_context();
113+
114+
const char *input_file = nullptr;
115+
unsigned int width = 1024 * sizeof(unsigned int);
116+
unsigned int height = 1024;
117+
118+
// Initializes input.
119+
unsigned int input_size = width * height;
120+
unsigned int *input_ptr =
121+
(unsigned int *)malloc_shared(input_size, dev, ctxt);
122+
printf("Processing %dx%d inputs\n", (int)(width / sizeof(unsigned int)),
123+
height);
124+
125+
srand(2009);
126+
input_size = input_size / sizeof(int);
127+
for (int i = 0; i < input_size; ++i) {
128+
input_ptr[i] = rand() % 256;
129+
input_ptr[i] |= (rand() % 256) << 8;
130+
input_ptr[i] |= (rand() % 256) << 16;
131+
input_ptr[i] |= (rand() % 256) << 24;
132+
}
133+
134+
// Allocates system memory for output buffer.
135+
int buffer_size = sizeof(unsigned int) * NUM_BINS;
136+
unsigned int *hist = new unsigned int[buffer_size];
137+
if (hist == nullptr) {
138+
std::cerr << "Out of memory\n";
139+
exit(1);
140+
}
141+
memset(hist, 0, buffer_size);
142+
143+
// Uses the CPU to calculate the histogram output data.
144+
unsigned int cpu_histogram[NUM_BINS];
145+
memset(cpu_histogram, 0, sizeof(cpu_histogram));
146+
147+
HistogramCPU(input_size, input_ptr, cpu_histogram);
148+
149+
std::cout << "finish cpu_histogram\n";
150+
151+
// Uses the GPU to calculate the histogram output data.
152+
unsigned int *output_surface =
153+
(uint32_t *)malloc_shared(4 * NUM_BINS, dev, ctxt);
154+
memset(output_surface, 0, 4 * NUM_BINS);
155+
156+
unsigned int num_blocks{NUM_BLOCKS};
157+
if (argc == 2) {
158+
num_blocks = atoi(argv[1]);
159+
std::cout << "new num_blocks = " << num_blocks << "\n";
160+
}
161+
162+
cl::sycl::program prg(q.get_context());
163+
sycl::ONEAPI::experimental::spec_constant<unsigned int, NumBlocksConst>
164+
num_blocks_const = prg.set_spec_constant<NumBlocksConst>(num_blocks);
165+
prg.build_with_kernel_type<histogram_slm>();
166+
167+
unsigned int num_threads;
168+
num_threads = width * height / (num_blocks * BLOCK_WIDTH * sizeof(int));
169+
170+
auto GlobalRange = cl::sycl::range<1>(num_threads);
171+
auto LocalRange = cl::sycl::range<1>(NUM_BINS / 16);
172+
cl::sycl::nd_range<1> Range(GlobalRange, LocalRange);
173+
174+
{
175+
auto e = q.submit([&](cl::sycl::handler &cgh) {
176+
cgh.parallel_for<histogram_slm>(
177+
prg.get_kernel<histogram_slm>(), Range,
178+
[=](cl::sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL {
179+
histogram_atomic(input_ptr, output_surface, ndi.get_group(0),
180+
ndi.get_local_id(0), 16, num_blocks_const.get());
181+
});
182+
});
183+
e.wait();
184+
}
185+
186+
std::cout << "finish GPU histogram\n";
187+
188+
memcpy(hist, output_surface, 4 * NUM_BINS);
189+
190+
free(output_surface, ctxt);
191+
192+
free(input_ptr, ctxt);
193+
194+
// Compares the CPU histogram output data with the
195+
// GPU histogram output data.
196+
// If there is no difference, the result is correct.
197+
// Otherwise there is something wrong.
198+
int res = CheckHistogram(cpu_histogram, hist);
199+
if (res)
200+
std::cout << "PASSED\n";
201+
else
202+
std::cout << "FAILED\n";
203+
204+
return res ? 0 : -1;
205+
}
Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
//==---------------- mandelbrot.cpp - DPC++ ESIMD on-device test ----------==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
// TODO enable on Windows and Level Zero
9+
// REQUIRES: linux && gpu && opencl
10+
// RUN: %clangxx-esimd -fsycl %s -I%S/.. -o %t.out
11+
// RUN: %ESIMD_RUN_PLACEHOLDER %t.out %S/output.ppm %S/golden_hw.ppm 512 -2.09798 -1.19798 0.004 4.0
12+
13+
#include "esimd_test_utils.hpp"
14+
#include <CL/sycl.hpp>
15+
#include <CL/sycl/INTEL/esimd.hpp>
16+
#include <array>
17+
#include <iostream>
18+
#include <memory>
19+
20+
using namespace cl::sycl;
21+
using namespace sycl::INTEL::gpu;
22+
23+
#ifdef _SIM_MODE_
24+
#define CRUNCH 32
25+
#else
26+
#define CRUNCH 512 // emu/hw modes
27+
#endif
28+
29+
#define SCALE 0.004
30+
#define XOFF -2.09798
31+
#define YOFF -1.19798
32+
33+
#define WIDTH 800
34+
#define HEIGHT 602
35+
36+
template <typename ACC>
37+
ESIMD_INLINE void mandelbrot(ACC out_image, int ix, int iy, int crunch,
38+
float xOff, float yOff, float scale, float thrs) {
39+
ix *= 8;
40+
iy *= 2;
41+
42+
simd<int, 16> m = 0;
43+
44+
for (auto lane = 0; lane < 16; ++lane) {
45+
int ix_lane = ix + (lane & 0x7);
46+
int iy_lane = iy + (lane >> 3);
47+
float xPos = ix_lane * scale + xOff;
48+
float yPos = iy_lane * scale + yOff;
49+
float x = 0.0f;
50+
float y = 0.0f;
51+
float xx = 0.0f;
52+
float yy = 0.0f;
53+
54+
int mtemp = 0;
55+
do {
56+
y = x * y * 2.0f + yPos;
57+
x = xx - yy + xPos;
58+
yy = y * y;
59+
xx = x * x;
60+
mtemp += 1;
61+
} while ((mtemp < crunch) & (xx + yy < thrs));
62+
63+
m.select<1, 0>(lane) = mtemp;
64+
}
65+
66+
simd<int, 16> color = (((m * 15) & 0xff)) + (((m * 7) & 0xff) * 256) +
67+
(((m * 3) & 0xff) * 65536);
68+
69+
// because the output is a y-tile 2D surface
70+
// we can only write 32-byte wide
71+
media_block_store<unsigned char, 2, 32>(out_image, ix * sizeof(int), iy,
72+
color.format<unsigned char>());
73+
}
74+
75+
class CrunchConst;
76+
class XoffConst;
77+
class YoffConst;
78+
class ScaleConst;
79+
class ThrsConst;
80+
81+
class Test;
82+
83+
int main(int argc, char *argv[]) {
84+
if (argc != 3 && argc != 8) {
85+
std::cerr << "Usage: mandelbrot.exe output_file ref_file [crunch xoff yoff "
86+
"scale threshold]"
87+
<< std::endl;
88+
exit(1);
89+
}
90+
91+
// Gets the width and height of the input image.
92+
const unsigned img_size = WIDTH * HEIGHT * 4;
93+
// Sets output to blank image.
94+
unsigned char *buf = new unsigned char[img_size];
95+
96+
{
97+
cl::sycl::image<2> imgOutput((unsigned int *)buf, image_channel_order::rgba,
98+
image_channel_type::unsigned_int8,
99+
range<2>{WIDTH, HEIGHT});
100+
101+
// We need that many workitems
102+
uint range_width = WIDTH / 8;
103+
uint range_height = HEIGHT / 2;
104+
cl::sycl::range<2> GlobalRange{range_width, range_height};
105+
106+
// Number of workitems in a workgroup
107+
cl::sycl::range<2> LocalRange{1, 1};
108+
109+
queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler());
110+
111+
auto dev = q.get_device();
112+
auto ctxt = q.get_context();
113+
std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
114+
115+
int crunch{CRUNCH};
116+
float xoff{XOFF}, yoff{YOFF}, scale{SCALE}, thrs{4.0f};
117+
if (argc == 8) {
118+
crunch = atoi(argv[3]);
119+
xoff = (float)atof(argv[4]);
120+
yoff = (float)atof(argv[5]);
121+
scale = (float)atof(argv[6]);
122+
thrs = (float)atof(argv[7]);
123+
std::cout << "new crunch = " << crunch << ", xoff = " << xoff
124+
<< ", yoff = " << yoff << ", scale = " << scale
125+
<< ", thrs = " << thrs << "\n";
126+
}
127+
cl::sycl::program prg(q.get_context());
128+
sycl::ONEAPI::experimental::spec_constant<int, CrunchConst> crunch_const =
129+
prg.set_spec_constant<CrunchConst>(crunch);
130+
sycl::ONEAPI::experimental::spec_constant<float, XoffConst> xoff_const =
131+
prg.set_spec_constant<XoffConst>(xoff);
132+
sycl::ONEAPI::experimental::spec_constant<float, YoffConst> yoff_const =
133+
prg.set_spec_constant<YoffConst>(yoff);
134+
sycl::ONEAPI::experimental::spec_constant<float, ScaleConst> scale_const =
135+
prg.set_spec_constant<ScaleConst>(scale);
136+
sycl::ONEAPI::experimental::spec_constant<float, ThrsConst> thrs_const =
137+
prg.set_spec_constant<ThrsConst>(thrs);
138+
prg.build_with_kernel_type<Test>();
139+
140+
auto e = q.submit([&](cl::sycl::handler &cgh) {
141+
auto accOutput =
142+
imgOutput.get_access<uint4, cl::sycl::access::mode::write>(cgh);
143+
144+
cgh.parallel_for<Test>(prg.get_kernel<Test>(), GlobalRange * LocalRange,
145+
[=](item<2> it) SYCL_ESIMD_KERNEL {
146+
uint h_pos = it.get_id(0);
147+
uint v_pos = it.get_id(1);
148+
mandelbrot(accOutput, h_pos, v_pos,
149+
crunch_const.get(), xoff_const.get(),
150+
yoff_const.get(), scale_const.get(),
151+
thrs_const.get());
152+
});
153+
});
154+
e.wait();
155+
}
156+
157+
char *out_file = argv[1];
158+
FILE *dumpfile = fopen(out_file, "w");
159+
if (!dumpfile) {
160+
std::cerr << "Cannot open " << out_file << std::endl;
161+
return -2;
162+
}
163+
fprintf(dumpfile, "P6\x0d\x0a");
164+
fprintf(dumpfile, "%u %u\x0d\x0a", WIDTH, (HEIGHT - 2));
165+
fprintf(dumpfile, "%u\x0d\x0a", 255);
166+
fclose(dumpfile);
167+
dumpfile = fopen(out_file, "ab");
168+
for (int32_t i = 0; i < WIDTH * (HEIGHT - 2); ++i) {
169+
fwrite(&buf[i * 4], sizeof(char), 1, dumpfile);
170+
fwrite(&buf[i * 4 + 1], sizeof(char), 1, dumpfile);
171+
fwrite(&buf[i * 4 + 2], sizeof(char), 1, dumpfile);
172+
}
173+
fclose(dumpfile);
174+
175+
bool passed = true;
176+
if (!esimd_test::cmp_binary_files<unsigned char>(out_file, argv[2], 0)) {
177+
std::cerr << out_file << " does not match the reference file " << argv[2]
178+
<< std::endl;
179+
passed = false;
180+
}
181+
182+
delete[] buf;
183+
184+
if (passed) {
185+
std::cerr << "PASSED\n";
186+
return 0;
187+
} else {
188+
std::cerr << "FAILED\n";
189+
return 1;
190+
}
191+
}

0 commit comments

Comments
 (0)