|
| 1 | +// TODO enable on WIndows |
| 2 | +// REQUIRES: linux |
| 3 | +// REQUIRES: gpu |
| 4 | +// RUN: %clangxx-esimd -fsycl %s -o %t.out |
| 5 | +// RUN: env SYCL_DEVICE_TYPE=HOST %t.out |
| 6 | +// RUN: %ESIMD_RUN_PLACEHOLDER %t.out |
| 7 | + |
| 8 | +#include <CL/sycl.hpp> |
| 9 | +#include <CL/sycl/intel/esimd.hpp> |
| 10 | +#include <array> |
| 11 | +#include <iostream> |
| 12 | + |
| 13 | +using namespace cl::sycl; |
| 14 | + |
| 15 | +class ESIMDSelector : public device_selector { |
| 16 | + // Require GPU device unless HOST is requested in SYCL_DEVICE_TYPE env |
| 17 | + virtual int operator()(const device &device) const { |
| 18 | + if (const char *dev_type = getenv("SYCL_DEVICE_TYPE")) { |
| 19 | + if (!strcmp(dev_type, "GPU")) |
| 20 | + return device.is_gpu() ? 1000 : -1; |
| 21 | + if (!strcmp(dev_type, "HOST")) |
| 22 | + return device.is_host() ? 1000 : -1; |
| 23 | + std::cerr << "Supported 'SYCL_DEVICE_TYPE' env var values are 'GPU' and " |
| 24 | + "'HOST', '" |
| 25 | + << dev_type << "' is not.\n"; |
| 26 | + return -1; |
| 27 | + } |
| 28 | + // If "SYCL_DEVICE_TYPE" not defined, only allow gpu device |
| 29 | + return device.is_gpu() ? 1000 : -1; |
| 30 | + } |
| 31 | +}; |
| 32 | + |
| 33 | +auto exception_handler = [](exception_list l) { |
| 34 | + for (auto ep : l) { |
| 35 | + try { |
| 36 | + std::rethrow_exception(ep); |
| 37 | + } catch (cl::sycl::exception &e0) { |
| 38 | + std::cout << "sycl::exception: " << e0.what() << std::endl; |
| 39 | + } catch (std::exception &e) { |
| 40 | + std::cout << "std::exception: " << e.what() << std::endl; |
| 41 | + } catch (...) { |
| 42 | + std::cout << "generic exception\n"; |
| 43 | + } |
| 44 | + } |
| 45 | +}; |
| 46 | + |
| 47 | +#define NUM_BINS 256 |
| 48 | +#define IMG_WIDTH 1024 |
| 49 | +#define IMG_HEIGHT 1024 |
| 50 | +// |
| 51 | +// each parallel_for handles 64x32 bytes |
| 52 | +// |
| 53 | +#define BLOCK_WIDTH 32 |
| 54 | +#define BLOCK_HEIGHT 64 |
| 55 | + |
| 56 | +void histogram_CPU(unsigned int width, unsigned int height, unsigned char *srcY, |
| 57 | + unsigned int *cpuHistogram) { |
| 58 | + int i; |
| 59 | + for (i = 0; i < width * height; i++) { |
| 60 | + cpuHistogram[srcY[i]] += 1; |
| 61 | + } |
| 62 | +} |
| 63 | + |
| 64 | +void writeHist(unsigned int *hist) { |
| 65 | + int total = 0; |
| 66 | + |
| 67 | + std::cerr << "\nHistogram: \n"; |
| 68 | + for (int i = 0; i < NUM_BINS; i += 8) { |
| 69 | + std::cerr << "\n [" << i << " - " << i + 7 << "]:"; |
| 70 | + for (int j = 0; j < 8; j++) { |
| 71 | + std::cerr << "\t" << hist[i + j]; |
| 72 | + total += hist[i + j]; |
| 73 | + } |
| 74 | + } |
| 75 | + std::cerr << "\nTotal = " << total << " \n"; |
| 76 | +} |
| 77 | + |
| 78 | +int checkHistogram(unsigned int *refHistogram, unsigned int *hist) { |
| 79 | + |
| 80 | + for (int i = 0; i < NUM_BINS; i++) { |
| 81 | + if (refHistogram[i] != hist[i]) { |
| 82 | + return 0; |
| 83 | + } |
| 84 | + } |
| 85 | + return 1; |
| 86 | +} |
| 87 | + |
| 88 | +int main(int argc, char *argv[]) { |
| 89 | + |
| 90 | + const char *input_file = nullptr; |
| 91 | + unsigned int width = IMG_WIDTH * sizeof(unsigned int); |
| 92 | + unsigned int height = IMG_HEIGHT; |
| 93 | + |
| 94 | + if (argc == 2) { |
| 95 | + input_file = argv[1]; |
| 96 | + } else { |
| 97 | + std::cerr << "Usage: Histogram.exe input_file" << std::endl; |
| 98 | + std::cerr << "No input file specificed. Use default random value ...." |
| 99 | + << std::endl; |
| 100 | + } |
| 101 | + |
| 102 | + // ------------------------------------------------------------------------ |
| 103 | + // Read in image luma plane |
| 104 | + |
| 105 | + // Allocate Input Buffer |
| 106 | + queue q(ESIMDSelector{}, exception_handler); |
| 107 | + |
| 108 | + auto dev = q.get_device(); |
| 109 | + auto ctxt = q.get_context(); |
| 110 | + unsigned char *srcY = |
| 111 | + static_cast<unsigned char *>(malloc_shared(width * height, dev, ctxt)); |
| 112 | + unsigned int *bins = static_cast<unsigned int *>( |
| 113 | + malloc_shared(NUM_BINS * sizeof(unsigned int), dev, ctxt)); |
| 114 | + std::cout << "Running on " << dev.get_info<info::device::name>() << "\n"; |
| 115 | + |
| 116 | + uint range_width = width / BLOCK_WIDTH; |
| 117 | + uint range_height = height / BLOCK_HEIGHT; |
| 118 | + |
| 119 | + if (srcY == NULL) { |
| 120 | + std::cerr << "Out of memory\n"; |
| 121 | + exit(1); |
| 122 | + } |
| 123 | + |
| 124 | + // Initializes input. |
| 125 | + unsigned int input_size = width * height; |
| 126 | + std::cerr << "Processing inputs\n"; |
| 127 | + |
| 128 | + if (input_file != nullptr) { |
| 129 | + FILE *f = fopen(input_file, "rb"); |
| 130 | + if (f == NULL) { |
| 131 | + std::cerr << "Error opening file " << input_file; |
| 132 | + std::exit(1); |
| 133 | + } |
| 134 | + |
| 135 | + unsigned int cnt = fread(srcY, sizeof(unsigned char), input_size, f); |
| 136 | + if (cnt != input_size) { |
| 137 | + std::cerr << "Error reading input from " << input_file; |
| 138 | + std::exit(1); |
| 139 | + } |
| 140 | + } else { |
| 141 | + srand(2009); |
| 142 | + for (int i = 0; i < input_size; ++i) { |
| 143 | + srcY[i] = rand() % 256; |
| 144 | + } |
| 145 | + } |
| 146 | + |
| 147 | + for (int i = 0; i < NUM_BINS; i++) { |
| 148 | + bins[i] = 0; |
| 149 | + } |
| 150 | + |
| 151 | + // ------------------------------------------------------------------------ |
| 152 | + // CPU Execution: |
| 153 | + |
| 154 | + unsigned int cpuHistogram[NUM_BINS]; |
| 155 | + memset(cpuHistogram, 0, sizeof(cpuHistogram)); |
| 156 | + histogram_CPU(width, height, srcY, cpuHistogram); |
| 157 | + |
| 158 | + cl::sycl::image<2> Img(srcY, image_channel_order::rgba, |
| 159 | + image_channel_type::unsigned_int32, |
| 160 | + range<2>{width / sizeof(uint4), height}); |
| 161 | + |
| 162 | + { |
| 163 | + // create ranges |
| 164 | + // We need that many workitems |
| 165 | + auto GlobalRange = range<1>(range_width * range_height); |
| 166 | + // Number of workitems in a workgroup |
| 167 | + auto LocalRange = range<1>(1); |
| 168 | + nd_range<1> Range(GlobalRange, LocalRange); |
| 169 | + |
| 170 | + auto e = q.submit([&](handler &cgh) { |
| 171 | + auto readAcc = Img.get_access<uint4, cl::sycl::access::mode::read>(cgh); |
| 172 | + |
| 173 | + cgh.parallel_for<class Hist>( |
| 174 | + Range, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL { |
| 175 | + using namespace sycl::intel::gpu; |
| 176 | + |
| 177 | + // Get thread origin offsets |
| 178 | + uint tid = ndi.get_group(0); |
| 179 | + uint h_pos = (tid % range_width) * BLOCK_WIDTH; |
| 180 | + uint v_pos = (tid / range_width) * BLOCK_HEIGHT; |
| 181 | + |
| 182 | + // Declare a 8x32 uchar matrix to store the input block pixel value |
| 183 | + simd<unsigned char, 8 * 32> in; |
| 184 | + |
| 185 | + // Declare a vector to store the local histogram |
| 186 | + simd<unsigned int, NUM_BINS> histogram(0); |
| 187 | + |
| 188 | + // Each thread handles BLOCK_HEIGHTxBLOCK_WIDTH pixel block |
| 189 | + for (int y = 0; y < BLOCK_HEIGHT / 8; y++) { |
| 190 | + // Perform 2D media block read to load 8x32 pixel block |
| 191 | + in = |
| 192 | + media_block_load<unsigned char, 8, 32>(readAcc, h_pos, v_pos); |
| 193 | + |
| 194 | + // Accumulate local histogram for each pixel value |
| 195 | +#pragma unroll |
| 196 | + for (int i = 0; i < 8; i++) { |
| 197 | +#pragma unroll |
| 198 | + for (int j = 0; j < 32; j++) { |
| 199 | + histogram.select<1, 1>(in[i * 32 + j]) += 1; |
| 200 | + } |
| 201 | + } |
| 202 | + |
| 203 | + // Update starting offset for the next work block |
| 204 | + v_pos += 8; |
| 205 | + } |
| 206 | + |
| 207 | + // Declare a vector to store the offset for atomic write operation |
| 208 | + simd<unsigned int, 8> offset(0, 1); // init to 0, 1, 2, ..., 7 |
| 209 | + offset *= sizeof(unsigned int); |
| 210 | + |
| 211 | + // Update global sum by atomically adding each local histogram |
| 212 | +#pragma unroll |
| 213 | + for (int i = 0; i < NUM_BINS; i += 8) { |
| 214 | + // Declare a vector to store the source for atomic write operation |
| 215 | + simd<unsigned int, 8> src; |
| 216 | + src = histogram.select<8, 1>(i); |
| 217 | + |
| 218 | +#ifdef __SYCL_DEVICE_ONLY__ |
| 219 | + flat_atomic<EsimdAtomicOpType::ATOMIC_ADD, unsigned int, 8>( |
| 220 | + bins, offset, src, 1); |
| 221 | + offset += 8 * sizeof(unsigned int); |
| 222 | +#else |
| 223 | + auto vals = block_load<unsigned int, 8>(bins + i); |
| 224 | + vals = vals + src; |
| 225 | + block_store<unsigned int, 8>(bins + i, vals); |
| 226 | +#endif |
| 227 | + } |
| 228 | + }); |
| 229 | + }); |
| 230 | + e.wait(); |
| 231 | + |
| 232 | + // SYCL will enqueue and run the kernel. Recall that the buffer's data is |
| 233 | + // given back to the host at the end of scope. |
| 234 | + } // make sure data is given back to the host at the end of this scope |
| 235 | + |
| 236 | + writeHist(bins); |
| 237 | + writeHist(cpuHistogram); |
| 238 | + // Checking Histogram |
| 239 | + if (checkHistogram(cpuHistogram, bins)) { |
| 240 | + std::cerr << "PASSED\n"; |
| 241 | + return 0; |
| 242 | + } else { |
| 243 | + std::cerr << "FAILED\n"; |
| 244 | + return 1; |
| 245 | + } |
| 246 | + |
| 247 | + return 0; |
| 248 | +} |
0 commit comments