Skip to content

Commit c9e82e3

Browse files
author
Gang Y Chen
committed
[SYCL][ESIMD] histogram 256 bin using SLM
Signed-off-by: Gang Y Chen <[email protected]>
1 parent 58eea55 commit c9e82e3

File tree

1 file changed

+191
-0
lines changed

1 file changed

+191
-0
lines changed
Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
//==--------------- histogram_256_slm.cpp - DPC++ ESIMD on-device test ----==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
// TODO enable on Windows
9+
// REQUIRES: linux
10+
// REQUIRES: gpu
11+
// RUN: %clangxx-esimd -fsycl %s -o %t.out
12+
// RUN: %ESIMD_RUN_PLACEHOLDER %t.out
13+
14+
#include "esimd_test_utils.hpp"
15+
16+
#include <CL/sycl.hpp>
17+
#include <CL/sycl/INTEL/esimd.hpp>
18+
#include <iostream>
19+
20+
static constexpr int NUM_BINS = 256;
21+
static constexpr int SLM_SIZE = (NUM_BINS * 4);
22+
static constexpr int BLOCK_WIDTH = 32;
23+
static constexpr int NUM_BLOCKS = 32;
24+
25+
using namespace cl::sycl;
26+
using namespace sycl::INTEL::gpu;
27+
28+
// Histogram kernel: computes the distribution of pixel intensities
29+
ESIMD_INLINE void histogram_atomic(const uint32_t *input_ptr, uint32_t *output,
30+
uint32_t gid, uint32_t lid,
31+
uint32_t local_size) {
32+
// Declare and initialize SLM
33+
slm_init(SLM_SIZE);
34+
uint linear_id = gid * local_size + lid;
35+
36+
simd<uint, 16> slm_offset(0, 1);
37+
slm_offset += 16 * lid;
38+
slm_offset *= sizeof(int);
39+
simd<uint, 16> slm_data = 0;
40+
slm_store<uint, 16>(slm_data, slm_offset);
41+
slm_fence(ESIMD_GLOBAL_COHERENT_FENCE);
42+
esimd_barrier();
43+
44+
// Each thread handles NUM_BLOCKSxBLOCK_WIDTH pixel blocks
45+
auto start_off = (linear_id * BLOCK_WIDTH * NUM_BLOCKS);
46+
for (int y = 0; y < NUM_BLOCKS; y++) {
47+
auto start_addr = ((unsigned int *)input_ptr) + start_off;
48+
auto data = block_load<uint, 32>(start_addr);
49+
auto in = data.format<uchar>();
50+
51+
#pragma unroll
52+
for (int j = 0; j < BLOCK_WIDTH * sizeof(int); j += 16) {
53+
// Accumulate local histogram for each pixel value
54+
auto dataOffset = convert<uint, uchar, 16>(in.select<16, 1>(j).read());
55+
dataOffset *= sizeof(int);
56+
slm_atomic<EsimdAtomicOpType::ATOMIC_INC, uint, 16>(dataOffset, 1);
57+
}
58+
start_off += BLOCK_WIDTH;
59+
}
60+
slm_fence(ESIMD_GLOBAL_COHERENT_FENCE);
61+
esimd_barrier();
62+
63+
// Update global sum by atomically adding each local histogram
64+
simd<uint, 16> local_histogram;
65+
local_histogram = slm_load<uint32_t, 16>(slm_offset);
66+
flat_atomic<EsimdAtomicOpType::ATOMIC_ADD, uint32_t, 8>(
67+
output, slm_offset.select<8, 1>(0), local_histogram.select<8, 1>(0), 1);
68+
flat_atomic<EsimdAtomicOpType::ATOMIC_ADD, uint32_t, 8>(
69+
output, slm_offset.select<8, 1>(8), local_histogram.select<8, 1>(8), 1);
70+
}
71+
72+
// This function calculates histogram of the image with the CPU.
73+
// @param size: the size of the input array.
74+
// @param src: pointer to the input array.
75+
// @param cpu_histogram: pointer to the histogram of the input image.
76+
void HistogramCPU(unsigned int size, unsigned int *src,
77+
unsigned int *cpu_histogram) {
78+
for (int i = 0; i < size; i++) {
79+
unsigned int x = src[i];
80+
cpu_histogram[(x)&0xFFU] += 1;
81+
cpu_histogram[(x >> 8) & 0xFFU] += 1;
82+
cpu_histogram[(x >> 16) & 0xFFU] += 1;
83+
cpu_histogram[(x >> 24) & 0xFFU] += 1;
84+
}
85+
}
86+
87+
// This function compares the output data calculated by the CPU and the
88+
// GPU separately.
89+
// If they are identical, return 1, else return 0.
90+
int CheckHistogram(unsigned int *cpu_histogram, unsigned int *gpu_histogram) {
91+
unsigned int bad = 0;
92+
for (int i = 0; i < NUM_BINS; i++) {
93+
if (cpu_histogram[i] != gpu_histogram[i]) {
94+
std::cout << "At " << i << ": CPU = " << cpu_histogram[i]
95+
<< ", GPU = " << gpu_histogram[i] << std::endl;
96+
if (bad >= 256)
97+
return 0;
98+
bad++;
99+
}
100+
}
101+
if (bad > 0)
102+
return 0;
103+
104+
return 1;
105+
}
106+
107+
int main() {
108+
queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler());
109+
auto dev = q.get_device();
110+
auto ctxt = q.get_context();
111+
112+
const char *input_file = nullptr;
113+
unsigned int width = 1024 * sizeof(unsigned int);
114+
unsigned int height = 1024;
115+
116+
// Initializes input.
117+
unsigned int input_size = width * height;
118+
unsigned int *input_ptr =
119+
(unsigned int *)malloc_shared(input_size, dev, ctxt);
120+
printf("Processing %dx%d inputs\n", (int)(width / sizeof(unsigned int)),
121+
height);
122+
123+
srand(2009);
124+
input_size = input_size / sizeof(int);
125+
for (int i = 0; i < input_size; ++i) {
126+
input_ptr[i] = rand() % 256;
127+
input_ptr[i] |= (rand() % 256) << 8;
128+
input_ptr[i] |= (rand() % 256) << 16;
129+
input_ptr[i] |= (rand() % 256) << 24;
130+
}
131+
132+
// Allocates system memory for output buffer.
133+
int buffer_size = sizeof(unsigned int) * NUM_BINS;
134+
unsigned int *hist = new unsigned int[buffer_size];
135+
if (hist == nullptr) {
136+
std::cerr << "Out of memory\n";
137+
exit(1);
138+
}
139+
memset(hist, 0, buffer_size);
140+
141+
// Uses the CPU to calculate the histogram output data.
142+
unsigned int cpu_histogram[NUM_BINS];
143+
memset(cpu_histogram, 0, sizeof(cpu_histogram));
144+
145+
HistogramCPU(input_size, input_ptr, cpu_histogram);
146+
147+
std::cout << "finish cpu_histogram\n";
148+
149+
// Uses the GPU to calculate the histogram output data.
150+
unsigned int *output_surface =
151+
(uint32_t *)malloc_shared(4 * NUM_BINS, dev, ctxt);
152+
memset(output_surface, 0, 4 * NUM_BINS);
153+
154+
unsigned int num_threads;
155+
num_threads = width * height / (NUM_BLOCKS * BLOCK_WIDTH * sizeof(int));
156+
157+
auto GlobalRange = cl::sycl::range<1>(num_threads);
158+
auto LocalRange = cl::sycl::range<1>(NUM_BINS / 16);
159+
cl::sycl::nd_range<1> Range(GlobalRange, LocalRange);
160+
161+
{
162+
auto e = q.submit([&](cl::sycl::handler &cgh) {
163+
cgh.parallel_for<class histogram_slm>(
164+
Range, [=](cl::sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL {
165+
histogram_atomic(input_ptr, output_surface, ndi.get_group(0),
166+
ndi.get_local_id(0), 16);
167+
});
168+
});
169+
e.wait();
170+
}
171+
172+
std::cout << "finish GPU histogram\n";
173+
174+
memcpy(hist, output_surface, 4 * NUM_BINS);
175+
176+
free(output_surface, ctxt);
177+
178+
free(input_ptr, ctxt);
179+
180+
// Compares the CPU histogram output data with the
181+
// GPU histogram output data.
182+
// If there is no difference, the result is correct.
183+
// Otherwise there is something wrong.
184+
int res = CheckHistogram(cpu_histogram, hist);
185+
if (res)
186+
std::cout << "PASSED\n";
187+
else
188+
std::cout << "FAILED\n";
189+
190+
return res ? 0 : -1;
191+
}

0 commit comments

Comments
 (0)