|
| 1 | +//==---------------- stencil.cpp - DPC++ ESIMD on-device test ------------==// |
| 2 | +// |
| 3 | +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | +// See https://llvm.org/LICENSE.txt for license information. |
| 5 | +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | +// |
| 7 | +//===----------------------------------------------------------------------===// |
| 8 | +// TODO enable on Windows and Level Zero |
| 9 | +// REQUIRES: linux && gpu && opencl |
| 10 | +// RUN: %clangxx-esimd -fsycl %s -o %t.out |
| 11 | +// RUN: %HOST_RUN_PLACEHOLDER %t.out |
| 12 | +// RUN: %ESIMD_RUN_PLACEHOLDER %t.out |
| 13 | + |
| 14 | +#include "esimd_test_utils.hpp" |
| 15 | + |
| 16 | +#include <CL/sycl.hpp> |
| 17 | +#include <CL/sycl/INTEL/esimd.hpp> |
| 18 | +#include <iostream> |
| 19 | + |
| 20 | +// |
| 21 | +// test smaller input size |
| 22 | +// test 8x16 block size |
| 23 | +// |
| 24 | +#define DIM_SIZE (1 << 10) |
| 25 | +#define SQUARE_SZ (DIM_SIZE * DIM_SIZE) |
| 26 | + |
| 27 | +#define WIDTH 16 |
| 28 | +#define HEIGHT 16 |
| 29 | + |
| 30 | +using namespace cl::sycl; |
| 31 | + |
| 32 | +void InitializeSquareMatrix(float *matrix, size_t const Dim, |
| 33 | + bool const bSkipDataGeneration) { |
| 34 | + memset(matrix, 0, Dim * Dim * sizeof(float)); |
| 35 | + if (!bSkipDataGeneration) { |
| 36 | + for (unsigned int iRow = 0; iRow < Dim; ++iRow) { |
| 37 | + for (unsigned int iCol = 0; iCol < Dim; ++iCol) { |
| 38 | + matrix[iRow * Dim + iCol] = static_cast<float>(iRow + iCol); |
| 39 | + } |
| 40 | + } |
| 41 | + } |
| 42 | +} |
| 43 | + |
| 44 | +bool CheckResults(float *out, float *in) { |
| 45 | + unsigned int n = DIM_SIZE; |
| 46 | + for (unsigned int i = 0; i < n; i++) { |
| 47 | + for (unsigned int j = 0; j < n; j++) { |
| 48 | + if ((5 <= i) && (i < n - 5) && (5 <= j) && (j < n - 5)) { |
| 49 | + float res = +in[(i - 5) * n + (j + 0)] * -0.02f + |
| 50 | + in[(i - 4) * n + (j + 0)] * -0.025f + |
| 51 | + in[(i - 3) * n + (j + 0)] * -0.0333333333333f + |
| 52 | + in[(i - 2) * n + (j + 0)] * -0.05f + |
| 53 | + in[(i - 1) * n + (j + 0)] * -0.1f + |
| 54 | + in[(i + 0) * n + (j - 5)] * -0.02f + |
| 55 | + in[(i + 0) * n + (j - 4)] * -0.025f + |
| 56 | + in[(i + 0) * n + (j - 3)] * -0.0333333333333f + |
| 57 | + in[(i + 0) * n + (j - 2)] * -0.05f + |
| 58 | + in[(i + 0) * n + (j - 1)] * -0.1f + |
| 59 | + in[(i + 0) * n + (j + 1)] * 0.1f + |
| 60 | + in[(i + 0) * n + (j + 2)] * 0.05f + |
| 61 | + in[(i + 0) * n + (j + 3)] * 0.0333333333333f + |
| 62 | + in[(i + 0) * n + (j + 4)] * 0.025f + |
| 63 | + in[(i + 0) * n + (j + 5)] * 0.02f + |
| 64 | + in[(i + 1) * n + (j + 0)] * 0.1f + |
| 65 | + in[(i + 2) * n + (j + 0)] * 0.05f + |
| 66 | + in[(i + 3) * n + (j + 0)] * 0.0333333333333f + |
| 67 | + in[(i + 4) * n + (j + 0)] * 0.025f + |
| 68 | + in[(i + 5) * n + (j + 0)] * 0.02f; |
| 69 | + |
| 70 | + // check result |
| 71 | + if (abs(res - out[i * n + j]) >= 0.0015f) { |
| 72 | + std::cout << "out[" << i << "][" << j << "] = " << out[i * n + j] |
| 73 | + << " expect result " << res << std::endl; |
| 74 | + return false; |
| 75 | + } |
| 76 | + } |
| 77 | + } |
| 78 | + } |
| 79 | + return true; |
| 80 | +} |
| 81 | + |
| 82 | +int main(void) { |
| 83 | + uint range_width = |
| 84 | + (DIM_SIZE - 10) / WIDTH + (((DIM_SIZE - 10) % WIDTH == 0) ? 0 : 1); |
| 85 | + uint range_height = |
| 86 | + (DIM_SIZE - 10) / HEIGHT + (((DIM_SIZE - 10) % HEIGHT == 0) ? 0 : 1); |
| 87 | + cl::sycl::range<2> GlobalRange{range_width, range_height}; |
| 88 | + |
| 89 | + std::cout << "width = " << range_width << " height = " << range_height |
| 90 | + << std::endl; |
| 91 | + cl::sycl::range<2> LocalRange{1, 1}; |
| 92 | + |
| 93 | + queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler()); |
| 94 | + |
| 95 | + auto dev = q.get_device(); |
| 96 | + std::cout << "Running on " << dev.get_info<info::device::name>() << "\n"; |
| 97 | + auto ctxt = q.get_context(); |
| 98 | + |
| 99 | + // create and init matrices |
| 100 | + float *inputMatrix = |
| 101 | + static_cast<float *>(malloc_shared(SQUARE_SZ * sizeof(float), dev, ctxt)); |
| 102 | + float *outputMatrix = |
| 103 | + static_cast<float *>(malloc_shared(SQUARE_SZ * sizeof(float), dev, ctxt)); |
| 104 | + InitializeSquareMatrix(inputMatrix, DIM_SIZE, false); |
| 105 | + InitializeSquareMatrix(outputMatrix, DIM_SIZE, true); |
| 106 | + |
| 107 | + auto e = q.submit([&](handler &cgh) { |
| 108 | + cgh.parallel_for<class Stencil_kernel>( |
| 109 | + GlobalRange * LocalRange, [=](item<2> it) SYCL_ESIMD_KERNEL { |
| 110 | + using namespace sycl::INTEL::gpu; |
| 111 | + uint h_pos = it.get_id(0); |
| 112 | + uint v_pos = it.get_id(1); |
| 113 | + |
| 114 | + simd<float, (HEIGHT + 10) * 32> vin; |
| 115 | + // matrix HEIGHT+10 x 32 |
| 116 | + auto in = vin.format<float, HEIGHT + 10, 32>(); |
| 117 | + |
| 118 | + // |
| 119 | + // rather than loading all data in |
| 120 | + // the code will interleave data loading and compute |
| 121 | + // first, we load enough data for the first 16 pixels |
| 122 | + // |
| 123 | + unsigned off = (v_pos * HEIGHT) * DIM_SIZE + h_pos * WIDTH; |
| 124 | +#pragma unroll |
| 125 | + for (unsigned i = 0; i < 10; i++) { |
| 126 | + in.row(i) = block_load<float, 32>(inputMatrix + off); |
| 127 | + off += DIM_SIZE; |
| 128 | + } |
| 129 | + |
| 130 | + unsigned out_off = |
| 131 | + (((v_pos * HEIGHT + 5) * DIM_SIZE + (h_pos * WIDTH) + 5)) * |
| 132 | + sizeof(float); |
| 133 | + simd<unsigned, WIDTH> elm16(0, 1); |
| 134 | + |
| 135 | +#pragma unroll |
| 136 | + for (unsigned i = 0; i < HEIGHT; i++) { |
| 137 | + |
| 138 | + in.row(10 + i) = block_load<float, 32>(inputMatrix + off); |
| 139 | + off += DIM_SIZE; |
| 140 | + |
| 141 | + simd<float, WIDTH> sum = |
| 142 | + in.row(i + 0).select<WIDTH, 1>(5) * -0.02f + |
| 143 | + in.row(i + 1).select<WIDTH, 1>(5) * -0.025f + |
| 144 | + in.row(i + 2).select<WIDTH, 1>(5) * -0.0333333333333f + |
| 145 | + in.row(i + 3).select<WIDTH, 1>(5) * -0.05f + |
| 146 | + in.row(i + 4).select<WIDTH, 1>(5) * -0.1f + |
| 147 | + in.row(i + 6).select<WIDTH, 1>(5) * 0.1f + |
| 148 | + in.row(i + 7).select<WIDTH, 1>(5) * 0.05f + |
| 149 | + in.row(i + 8).select<WIDTH, 1>(5) * 0.0333333333333f + |
| 150 | + in.row(i + 9).select<WIDTH, 1>(5) * 0.025f + |
| 151 | + in.row(i + 10).select<WIDTH, 1>(5) * 0.02f + |
| 152 | + in.row(i + 5).select<WIDTH, 1>(0) * -0.02f + |
| 153 | + in.row(i + 5).select<WIDTH, 1>(1) * -0.025f + |
| 154 | + in.row(i + 5).select<WIDTH, 1>(2) * -0.0333333333333f + |
| 155 | + in.row(i + 5).select<WIDTH, 1>(3) * -0.05f + |
| 156 | + in.row(i + 5).select<WIDTH, 1>(4) * -0.1f + |
| 157 | + in.row(i + 5).select<WIDTH, 1>(6) * 0.1f + |
| 158 | + in.row(i + 5).select<WIDTH, 1>(7) * 0.05f + |
| 159 | + in.row(i + 5).select<WIDTH, 1>(8) * 0.0333333333333f + |
| 160 | + in.row(i + 5).select<WIDTH, 1>(9) * 0.025f + |
| 161 | + in.row(i + 5).select<WIDTH, 1>(10) * 0.02f; |
| 162 | + |
| 163 | + // predciate output |
| 164 | + simd<ushort, WIDTH> p = (elm16 + h_pos * WIDTH) < DIM_SIZE - 10; |
| 165 | + |
| 166 | + simd<unsigned, WIDTH> elm16_off = elm16 * sizeof(float) + out_off; |
| 167 | + scatter<float, WIDTH>(outputMatrix, sum, elm16_off, p); |
| 168 | + out_off += DIM_SIZE * sizeof(float); |
| 169 | + |
| 170 | + if (v_pos * HEIGHT + 10 + i >= DIM_SIZE - 1) |
| 171 | + break; |
| 172 | + } |
| 173 | + }); |
| 174 | + }); |
| 175 | + e.wait(); |
| 176 | + |
| 177 | + // check result |
| 178 | + bool passed = CheckResults(outputMatrix, inputMatrix); |
| 179 | + if (passed) { |
| 180 | + std::cout << "PASSED" << std::endl; |
| 181 | + } else { |
| 182 | + std::cout << "FAILED" << std::endl; |
| 183 | + } |
| 184 | + free(inputMatrix, ctxt); |
| 185 | + free(outputMatrix, ctxt); |
| 186 | + return 0; |
| 187 | +} |
0 commit comments