|
| 1 | +//==--------------- histogram_256_slm.cpp - DPC++ ESIMD on-device test ----==// |
| 2 | +// |
| 3 | +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | +// See https://llvm.org/LICENSE.txt for license information. |
| 5 | +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | +// |
| 7 | +//===----------------------------------------------------------------------===// |
| 8 | +// TODO enable on Windows and Level Zero |
| 9 | +// REQUIRES: linux && gpu && opencl |
| 10 | +// RUN: %clangxx-esimd -fsycl %s -o %t.out |
| 11 | +// RUN: %ESIMD_RUN_PLACEHOLDER %t.out 16 |
| 12 | + |
| 13 | +#include "esimd_test_utils.hpp" |
| 14 | + |
| 15 | +#include <CL/sycl.hpp> |
| 16 | +#include <CL/sycl/INTEL/esimd.hpp> |
| 17 | +#include <iostream> |
| 18 | + |
| 19 | +static constexpr int NUM_BINS = 256; |
| 20 | +static constexpr int SLM_SIZE = (NUM_BINS * 4); |
| 21 | +static constexpr int BLOCK_WIDTH = 32; |
| 22 | +static constexpr int NUM_BLOCKS = 32; |
| 23 | + |
| 24 | +using namespace cl::sycl; |
| 25 | +using namespace sycl::INTEL::gpu; |
| 26 | + |
| 27 | +// Histogram kernel: computes the distribution of pixel intensities |
| 28 | +ESIMD_INLINE void histogram_atomic(const uint32_t *input_ptr, uint32_t *output, |
| 29 | + uint32_t gid, uint32_t lid, |
| 30 | + uint32_t local_size, uint32_t num_blocks) { |
| 31 | + // Declare and initialize SLM |
| 32 | + slm_init(SLM_SIZE); |
| 33 | + uint linear_id = gid * local_size + lid; |
| 34 | + |
| 35 | + simd<uint, 16> slm_offset(0, 1); |
| 36 | + slm_offset += 16 * lid; |
| 37 | + slm_offset *= sizeof(int); |
| 38 | + simd<uint, 16> slm_data = 0; |
| 39 | + slm_store<uint, 16>(slm_data, slm_offset); |
| 40 | + esimd_fence(ESIMD_GLOBAL_COHERENT_FENCE); |
| 41 | + esimd_barrier(); |
| 42 | + |
| 43 | + // Each thread handles NUM_BLOCKSxBLOCK_WIDTH pixel blocks |
| 44 | + auto start_off = (linear_id * BLOCK_WIDTH * num_blocks); |
| 45 | + for (int y = 0; y < num_blocks; y++) { |
| 46 | + auto start_addr = ((unsigned int *)input_ptr) + start_off; |
| 47 | + auto data = block_load<uint, 32>(start_addr); |
| 48 | + auto in = data.format<uchar>(); |
| 49 | + |
| 50 | +#pragma unroll |
| 51 | + for (int j = 0; j < BLOCK_WIDTH * sizeof(int); j += 16) { |
| 52 | + // Accumulate local histogram for each pixel value |
| 53 | + auto dataOffset = convert<uint, uchar, 16>(in.select<16, 1>(j).read()); |
| 54 | + dataOffset *= sizeof(int); |
| 55 | + slm_atomic<EsimdAtomicOpType::ATOMIC_INC, uint, 16>(dataOffset, 1); |
| 56 | + } |
| 57 | + start_off += BLOCK_WIDTH; |
| 58 | + } |
| 59 | + esimd_fence(ESIMD_GLOBAL_COHERENT_FENCE); |
| 60 | + esimd_barrier(); |
| 61 | + |
| 62 | + // Update global sum by atomically adding each local histogram |
| 63 | + simd<uint, 16> local_histogram; |
| 64 | + local_histogram = slm_load<uint32_t, 16>(slm_offset); |
| 65 | + flat_atomic<EsimdAtomicOpType::ATOMIC_ADD, uint32_t, 8>( |
| 66 | + output, slm_offset.select<8, 1>(0), local_histogram.select<8, 1>(0), 1); |
| 67 | + flat_atomic<EsimdAtomicOpType::ATOMIC_ADD, uint32_t, 8>( |
| 68 | + output, slm_offset.select<8, 1>(8), local_histogram.select<8, 1>(8), 1); |
| 69 | +} |
| 70 | + |
| 71 | +// This function calculates histogram of the image with the CPU. |
| 72 | +// @param size: the size of the input array. |
| 73 | +// @param src: pointer to the input array. |
| 74 | +// @param cpu_histogram: pointer to the histogram of the input image. |
| 75 | +void HistogramCPU(unsigned int size, unsigned int *src, |
| 76 | + unsigned int *cpu_histogram) { |
| 77 | + for (int i = 0; i < size; i++) { |
| 78 | + unsigned int x = src[i]; |
| 79 | + cpu_histogram[(x)&0xFFU] += 1; |
| 80 | + cpu_histogram[(x >> 8) & 0xFFU] += 1; |
| 81 | + cpu_histogram[(x >> 16) & 0xFFU] += 1; |
| 82 | + cpu_histogram[(x >> 24) & 0xFFU] += 1; |
| 83 | + } |
| 84 | +} |
| 85 | + |
| 86 | +// This function compares the output data calculated by the CPU and the |
| 87 | +// GPU separately. |
| 88 | +// If they are identical, return 1, else return 0. |
| 89 | +int CheckHistogram(unsigned int *cpu_histogram, unsigned int *gpu_histogram) { |
| 90 | + unsigned int bad = 0; |
| 91 | + for (int i = 0; i < NUM_BINS; i++) { |
| 92 | + if (cpu_histogram[i] != gpu_histogram[i]) { |
| 93 | + std::cout << "At " << i << ": CPU = " << cpu_histogram[i] |
| 94 | + << ", GPU = " << gpu_histogram[i] << std::endl; |
| 95 | + if (bad >= 256) |
| 96 | + return 0; |
| 97 | + bad++; |
| 98 | + } |
| 99 | + } |
| 100 | + if (bad > 0) |
| 101 | + return 0; |
| 102 | + |
| 103 | + return 1; |
| 104 | +} |
| 105 | + |
| 106 | +class NumBlocksConst; |
| 107 | +class histogram_slm; |
| 108 | + |
| 109 | +int main(int argc, char **argv) { |
| 110 | + queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler()); |
| 111 | + auto dev = q.get_device(); |
| 112 | + auto ctxt = q.get_context(); |
| 113 | + |
| 114 | + const char *input_file = nullptr; |
| 115 | + unsigned int width = 1024 * sizeof(unsigned int); |
| 116 | + unsigned int height = 1024; |
| 117 | + |
| 118 | + // Initializes input. |
| 119 | + unsigned int input_size = width * height; |
| 120 | + unsigned int *input_ptr = |
| 121 | + (unsigned int *)malloc_shared(input_size, dev, ctxt); |
| 122 | + printf("Processing %dx%d inputs\n", (int)(width / sizeof(unsigned int)), |
| 123 | + height); |
| 124 | + |
| 125 | + srand(2009); |
| 126 | + input_size = input_size / sizeof(int); |
| 127 | + for (int i = 0; i < input_size; ++i) { |
| 128 | + input_ptr[i] = rand() % 256; |
| 129 | + input_ptr[i] |= (rand() % 256) << 8; |
| 130 | + input_ptr[i] |= (rand() % 256) << 16; |
| 131 | + input_ptr[i] |= (rand() % 256) << 24; |
| 132 | + } |
| 133 | + |
| 134 | + // Allocates system memory for output buffer. |
| 135 | + int buffer_size = sizeof(unsigned int) * NUM_BINS; |
| 136 | + unsigned int *hist = new unsigned int[buffer_size]; |
| 137 | + if (hist == nullptr) { |
| 138 | + std::cerr << "Out of memory\n"; |
| 139 | + exit(1); |
| 140 | + } |
| 141 | + memset(hist, 0, buffer_size); |
| 142 | + |
| 143 | + // Uses the CPU to calculate the histogram output data. |
| 144 | + unsigned int cpu_histogram[NUM_BINS]; |
| 145 | + memset(cpu_histogram, 0, sizeof(cpu_histogram)); |
| 146 | + |
| 147 | + HistogramCPU(input_size, input_ptr, cpu_histogram); |
| 148 | + |
| 149 | + std::cout << "finish cpu_histogram\n"; |
| 150 | + |
| 151 | + // Uses the GPU to calculate the histogram output data. |
| 152 | + unsigned int *output_surface = |
| 153 | + (uint32_t *)malloc_shared(4 * NUM_BINS, dev, ctxt); |
| 154 | + memset(output_surface, 0, 4 * NUM_BINS); |
| 155 | + |
| 156 | + unsigned int num_blocks{NUM_BLOCKS}; |
| 157 | + if (argc == 2) { |
| 158 | + num_blocks = atoi(argv[1]); |
| 159 | + std::cout << "new num_blocks = " << num_blocks << "\n"; |
| 160 | + } |
| 161 | + |
| 162 | + cl::sycl::program prg(q.get_context()); |
| 163 | + sycl::ONEAPI::experimental::spec_constant<unsigned int, NumBlocksConst> |
| 164 | + num_blocks_const = prg.set_spec_constant<NumBlocksConst>(num_blocks); |
| 165 | + prg.build_with_kernel_type<histogram_slm>(); |
| 166 | + |
| 167 | + unsigned int num_threads; |
| 168 | + num_threads = width * height / (num_blocks * BLOCK_WIDTH * sizeof(int)); |
| 169 | + |
| 170 | + auto GlobalRange = cl::sycl::range<1>(num_threads); |
| 171 | + auto LocalRange = cl::sycl::range<1>(NUM_BINS / 16); |
| 172 | + cl::sycl::nd_range<1> Range(GlobalRange, LocalRange); |
| 173 | + |
| 174 | + { |
| 175 | + auto e = q.submit([&](cl::sycl::handler &cgh) { |
| 176 | + cgh.parallel_for<histogram_slm>( |
| 177 | + prg.get_kernel<histogram_slm>(), Range, |
| 178 | + [=](cl::sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL { |
| 179 | + histogram_atomic(input_ptr, output_surface, ndi.get_group(0), |
| 180 | + ndi.get_local_id(0), 16, num_blocks_const.get()); |
| 181 | + }); |
| 182 | + }); |
| 183 | + e.wait(); |
| 184 | + } |
| 185 | + |
| 186 | + std::cout << "finish GPU histogram\n"; |
| 187 | + |
| 188 | + memcpy(hist, output_surface, 4 * NUM_BINS); |
| 189 | + |
| 190 | + free(output_surface, ctxt); |
| 191 | + |
| 192 | + free(input_ptr, ctxt); |
| 193 | + |
| 194 | + // Compares the CPU histogram output data with the |
| 195 | + // GPU histogram output data. |
| 196 | + // If there is no difference, the result is correct. |
| 197 | + // Otherwise there is something wrong. |
| 198 | + int res = CheckHistogram(cpu_histogram, hist); |
| 199 | + if (res) |
| 200 | + std::cout << "PASSED\n"; |
| 201 | + else |
| 202 | + std::cout << "FAILED\n"; |
| 203 | + |
| 204 | + return res ? 0 : -1; |
| 205 | +} |
0 commit comments