|
| 1 | +//==--------------- matrix_transpose2.cpp - DPC++ ESIMD on-device test ----==// |
| 2 | +// |
| 3 | +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | +// See https://llvm.org/LICENSE.txt for license information. |
| 5 | +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | +// |
| 7 | +//===----------------------------------------------------------------------===// |
| 8 | +// REQUIRES: gpu |
| 9 | +// UNSUPPORTED: cuda |
| 10 | +// RUN: %clangxx -fsycl %s -o %t.out |
| 11 | +// RUN: %GPU_RUN_PLACEHOLDER %t.out |
| 12 | + |
| 13 | +// This test checks matrix transpose implementation with media block read/write |
| 14 | + |
#include "esimd_test_utils.hpp"

#include <CL/sycl.hpp>
#include <CL/sycl/INTEL/esimd.hpp>

#include <iostream>
#include <vector>
| 20 | + |
| 21 | +using namespace cl::sycl; |
| 22 | +using namespace std; |
| 23 | +using namespace sycl::ext::intel::experimental::esimd; |
| 24 | + |
// Fill the N x N matrix M with the values 0 .. N*N-1 in row-major order,
// i.e. M(i, j) = i * N + j. checkResult() later verifies the transpose of
// exactly this pattern.
// N must be a power of 2 and at least 8 (the assert below enforces this;
// its message previously claimed ">= 16", contradicting the actual check).
void initMatrix(int *M, unsigned N) {
  assert(N >= 8 && (((N - 1) & N) == 0) &&
         "only power of 2 (>= 8) is supported");
  for (unsigned i = 0; i < N * N; ++i)
    M[i] = i;
}
| 31 | + |
// Print the N x N matrix M to stderr, preceded by the message 'msg'.
// Matrices larger than 64 x 64 are not dumped; only their size is reported.
void printMatrix(const char *msg, int *M, unsigned N) {
  std::cerr << msg << "\n";
  if (N > 64) {
    // Fixed typo in the user-visible message ("maitrix" -> "matrix").
    std::cerr << "<<<matrix of size " << N << " x " << N << ">>>\n";
    return;
  }

  for (unsigned i = 0; i < N; ++i) {
    for (unsigned j = 0; j < N; ++j) {
      // Right-align each element in a 4-character field.
      std::cerr.width(4);
      std::cerr << M[i * N + j] << " ";
    }
    std::cerr << "\n";
  }
}
| 47 | + |
// Verify that M is the transpose of the matrix produced by initMatrix():
// element (i, j) of the original, whose value is i*N+j, must now live at
// position (j, i). Reports the first mismatch to stderr and returns false;
// returns true when every element is in its transposed place.
bool checkResult(const int *M, unsigned N) {
  for (unsigned row = 0; row < N; ++row) {
    for (unsigned col = 0; col < N; ++col) {
      const unsigned expected = row * N + col;
      const unsigned actual = M[col * N + row];
      if (actual == expected)
        continue;
      std::cerr << "Error at M(" << row << ", " << col << ") = " << actual
                << "\n";
      return false;
    }
  }
  return true;
}
| 60 | + |
// The basic idea of vectorizing transposition can be illustrated by
// transposing a 2 x 2 matrix as follows:
//
// A B
// C D
// ==>
// merge([A, A, B, B], [C, C, D, D], 0b1010) = [A, C, B, D]
// ==>
// A C
// B D
//
// Transpose an 8x8 element matrix held row-major in a 64-element vector.
// Three rounds of merges (labeled j = 1, 2, 4 below) apply the 2x2 scheme
// above, ping-ponging the data between v1 and v2; the last round leaves
// the result in v2, which is returned.
template <typename T>
ESIMD_INLINE simd<T, 64> transpose_matrix(simd<T, 64> v1) {
  simd<T, 64> v2;
  // mask to control how to merge two vectors: lanes 0, 2, 4, ... are set
  // to 1 so merge() interleaves the two replicated operands lane-by-lane.
  simd<uint16_t, 16> mask = 0;
  mask.select<8, 2>(0) = 1;
  // View both 64-element vectors as 4 x 16 matrices so that row() selects
  // 16-lane slices (two 8-element halves each).
  auto t1 = v1.template bit_cast_view<T, 4, 16>();
  auto t2 = v2.template bit_cast_view<T, 4, 16>();

  // j = 1: merge stride-2 replications of v1's rows into v2.
  t2.row(0).merge(t1.template replicate<8, 1, 2, 0>(0, 0),
                  t1.template replicate<8, 1, 2, 0>(2, 0), mask);
  t2.row(1).merge(t1.template replicate<8, 1, 2, 0>(0, 8),
                  t1.template replicate<8, 1, 2, 0>(2, 8), mask);
  t2.row(2).merge(t1.template replicate<8, 1, 2, 0>(1, 0),
                  t1.template replicate<8, 1, 2, 0>(3, 0), mask);
  t2.row(3).merge(t1.template replicate<8, 1, 2, 0>(1, 8),
                  t1.template replicate<8, 1, 2, 0>(3, 8), mask);

  // j = 2: same merge pattern applied back from v2 into v1.
  t1.row(0).merge(t2.template replicate<8, 1, 2, 0>(0, 0),
                  t2.template replicate<8, 1, 2, 0>(2, 0), mask);
  t1.row(1).merge(t2.template replicate<8, 1, 2, 0>(0, 8),
                  t2.template replicate<8, 1, 2, 0>(2, 8), mask);
  t1.row(2).merge(t2.template replicate<8, 1, 2, 0>(1, 0),
                  t2.template replicate<8, 1, 2, 0>(3, 0), mask);
  t1.row(3).merge(t2.template replicate<8, 1, 2, 0>(1, 8),
                  t2.template replicate<8, 1, 2, 0>(3, 8), mask);

  // j = 4: final round, v1 back into v2; v2 now holds the transpose.
  t2.row(0).merge(t1.template replicate<8, 1, 2, 0>(0, 0),
                  t1.template replicate<8, 1, 2, 0>(2, 0), mask);
  t2.row(1).merge(t1.template replicate<8, 1, 2, 0>(0, 8),
                  t1.template replicate<8, 1, 2, 0>(2, 8), mask);
  t2.row(2).merge(t1.template replicate<8, 1, 2, 0>(1, 0),
                  t1.template replicate<8, 1, 2, 0>(3, 0), mask);
  t2.row(3).merge(t1.template replicate<8, 1, 2, 0>(1, 8),
                  t1.template replicate<8, 1, 2, 0>(3, 8), mask);
  return v2;
}
| 112 | + |
// read a N-by-N sub-matrix of T from image 'img' at element position
// (col, row). The MZ (matrix size) argument is unused here; it is kept so
// read and write share the same call shape as the transpose kernels.
// NOTE(review): the fixed 8 x 32-byte media block assumes sizeof(T) == 4
// (8 rows x 8 elements x 4 bytes); all callers instantiate T = int --
// confirm before using any other element type.
template <typename T, int N, typename AccessorTy>
ESIMD_INLINE simd<T, N * N> read(AccessorTy img, int MZ, int col, int row) {
  static_assert(N == 8, "only 8x8 sub-matrix is supported");

  simd<T, N * N> res;
  // View the result vector as raw bytes; the media block load fills it.
  auto in = res.template bit_cast_view<unsigned char, 8, 32>();
  // The x-offset of a media block load is in bytes, hence col * sizeof(T).
  in = media_block_load<unsigned char, 8, 32>(img, col * sizeof(T), row);

  return res;
}
| 124 | + |
| 125 | +// write a N-by-N sub-matrix |
| 126 | +template <typename T, int N, typename AccessorTy> |
| 127 | +ESIMD_INLINE void write(AccessorTy img, int MZ, int col, int row, |
| 128 | + simd<T, N * N> val) { |
| 129 | + static_assert(N == 8, "only 8x8 sub-matrix is supported"); |
| 130 | + |
| 131 | + auto out = val.template bit_cast_view<uchar, 8, 32>(); |
| 132 | + media_block_store<unsigned char, 8, 32>(img, col * sizeof(T), row, out); |
| 133 | +} |
| 134 | + |
// Square matrix transposition on blocks of size 8x8; input and output are
// in the same image.
// Each invocation handles the 8x8 block at (block_col, block_row): a
// diagonal block is transposed onto itself, while for an off-diagonal pair
// only the thread with block_row < block_col acts, transposing both blocks
// and writing each to the other's location. Threads with
// block_row > block_col do nothing, so every pair is processed once.
template <typename AccessorInTy, typename AccessorOutTy>
ESIMD_INLINE void transpose8(AccessorInTy in, AccessorOutTy out, int MZ,
                             int block_col, int block_row) {
  static const int N = 8;

  if (block_row == block_col) {
    // Diagonal block: read, transpose, write back to the same position.
    auto m1 = read<int, N, AccessorInTy>(in, MZ, N * block_col, N * block_row);

    // cerr << m1 << std::endl;

    auto t1 = transpose_matrix(m1);

    // cerr << t1 << std::endl;

    write<int, N, AccessorOutTy>(out, MZ, N * block_row, N * block_col, t1);
  } else if (block_row < block_col) {
    // Read the two blocks to be swapped.
    auto m1 = read<int, N, AccessorInTy>(in, MZ, N * block_col, N * block_row);
    auto m2 = read<int, N, AccessorInTy>(in, MZ, N * block_row, N * block_col);

    // Transpose them.
    auto t1 = transpose_matrix(m1);
    auto t2 = transpose_matrix(m2);

    // Write them back to each other's (i.e. the transposed) location.
    write<int, N, AccessorOutTy>(out, MZ, N * block_row, N * block_col, t1);
    write<int, N, AccessorOutTy>(out, MZ, N * block_col, N * block_row, t2);
  }
}
| 166 | + |
// Square matrix transposition on blocks of size 16x16.
// In this version, a thread handles a block of size 16x16, which allows
// better latency hiding and substantially improves overall performance.
// Each 16x16 block is processed as four 8x8 sub-blocks; transposing the
// whole block means transposing each sub-block AND swapping the two
// off-diagonal sub-blocks. As in transpose8, off-diagonal block pairs are
// handled only by the thread with block_row < block_col.
template <typename AccessorInTy, typename AccessorOutTy>
ESIMD_INLINE void transpose16(AccessorInTy in, AccessorOutTy out, int MZ,
                              int block_col, int block_row) {
  static const int N = 16;

  if (block_row == block_col) {
    // Read a tile of 4 8x8 sub-blocks:
    //
    // [ m00 m01 ]
    // [ m10 m11 ]
    //
    // matrix<int, 8, 8> m00, m01, m10, m11, t00, t01, t10, t11;
    auto m00 = read<int, 8, AccessorInTy>(in, MZ, (N * block_col + 0),
                                          N * block_row + 0);
    auto m01 = read<int, 8, AccessorInTy>(in, MZ, (N * block_col + 8),
                                          N * block_row + 0);
    auto m10 = read<int, 8, AccessorInTy>(in, MZ, (N * block_col + 0),
                                          N * block_row + 8);
    auto m11 = read<int, 8, AccessorInTy>(in, MZ, (N * block_col + 8),
                                          N * block_row + 8);

    // Transpose each sub-block.
    auto t00 = transpose_matrix(m00);
    auto t01 = transpose_matrix(m01);
    auto t10 = transpose_matrix(m10);
    auto t11 = transpose_matrix(m11);

    // Write out with the off-diagonal sub-blocks swapped:
    //
    // [t00 t10]
    // [t01 t11]
    //
    write<int, 8, AccessorOutTy>(out, MZ, (N * block_col + 0),
                                 N * block_row + 0, t00);
    write<int, 8, AccessorOutTy>(out, MZ, (N * block_col + 8),
                                 N * block_row + 0, t10);
    write<int, 8, AccessorOutTy>(out, MZ, (N * block_col + 0),
                                 N * block_row + 8, t01);
    write<int, 8, AccessorOutTy>(out, MZ, (N * block_col + 8),
                                 N * block_row + 8, t11);
  } else if (block_row < block_col) {
    // Read two tiles of 4 8x8 sub-blocks to be swapped.
    //
    // matrix<int, 8, 8> a00, a01, a10, a11, t00, t01, t10, t11;
    auto a00 = read<int, 8, AccessorInTy>(in, MZ, (N * block_col + 0),
                                          N * block_row + 0);
    auto a01 = read<int, 8, AccessorInTy>(in, MZ, (N * block_col + 8),
                                          N * block_row + 0);
    auto a10 = read<int, 8, AccessorInTy>(in, MZ, (N * block_col + 0),
                                          N * block_row + 8);
    auto a11 = read<int, 8, AccessorInTy>(in, MZ, (N * block_col + 8),
                                          N * block_row + 8);

    // matrix<int, 8, 8> b00, b01, b10, b11, r00, r01, r10, r11;
    auto b00 = read<int, 8, AccessorInTy>(in, MZ, (N * block_row + 0),
                                          N * block_col + 0);
    auto b01 = read<int, 8, AccessorInTy>(in, MZ, (N * block_row + 8),
                                          N * block_col + 0);
    auto b10 = read<int, 8, AccessorInTy>(in, MZ, (N * block_row + 0),
                                          N * block_col + 8);
    auto b11 = read<int, 8, AccessorInTy>(in, MZ, (N * block_row + 8),
                                          N * block_col + 8);

    // Transpose the first tile.
    auto t00 = transpose_matrix(a00);
    auto t01 = transpose_matrix(a01);
    auto t10 = transpose_matrix(a10);
    auto t11 = transpose_matrix(a11);

    // Transpose the second tile.
    auto r00 = transpose_matrix(b00);
    auto r01 = transpose_matrix(b01);
    auto r10 = transpose_matrix(b10);
    auto r11 = transpose_matrix(b11);

    // Write the first tile to the transposed location (off-diagonal
    // sub-blocks swapped, as in the diagonal case).
    write<int, 8, AccessorOutTy>(out, MZ, (N * block_row + 0),
                                 N * block_col + 0, t00);
    write<int, 8, AccessorOutTy>(out, MZ, (N * block_row + 8),
                                 N * block_col + 0, t10);
    write<int, 8, AccessorOutTy>(out, MZ, (N * block_row + 0),
                                 N * block_col + 8, t01);
    write<int, 8, AccessorOutTy>(out, MZ, (N * block_row + 8),
                                 N * block_col + 8, t11);

    // Write the second tile to the transposed location.
    write<int, 8, AccessorOutTy>(out, MZ, (N * block_col + 0),
                                 N * block_row + 0, r00);
    write<int, 8, AccessorOutTy>(out, MZ, (N * block_col + 8),
                                 N * block_row + 0, r10);
    write<int, 8, AccessorOutTy>(out, MZ, (N * block_col + 0),
                                 N * block_row + 8, r01);
    write<int, 8, AccessorOutTy>(out, MZ, (N * block_col + 8),
                                 N * block_row + 8, r11);
  }
}
| 267 | + |
| 268 | +bool runTest(unsigned MZ, unsigned block_size) { |
| 269 | + queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler(), |
| 270 | + property::queue::enable_profiling{}); |
| 271 | + int *M = new int[MZ * MZ]; |
| 272 | + |
| 273 | + initMatrix(M, MZ); |
| 274 | + cerr << "\nTranspose square matrix of size " << MZ << "\n"; |
| 275 | + // printMatrix("Initial matrix:", M, MZ); |
| 276 | + |
| 277 | + // Each C-for-Metal thread works on one or two blocks of size 8 x 8. |
| 278 | + int thread_width = MZ / block_size; |
| 279 | + int thread_height = MZ / block_size; |
| 280 | + |
| 281 | + // create ranges |
| 282 | + // We need that many workitems |
| 283 | + auto GlobalRange = cl::sycl::range<2>(thread_width, thread_height); |
| 284 | + |
| 285 | + // Number of workitems in a workgroup |
| 286 | + cl::sycl::range<2> LocalRange{1, 1}; |
| 287 | + cl::sycl::nd_range<2> Range(GlobalRange, LocalRange); |
| 288 | + |
| 289 | + // Start timer. |
| 290 | + esimd_test::Timer timer; |
| 291 | + double start; |
| 292 | + |
| 293 | + // Launches the task on the GPU. |
| 294 | + double kernel_times = 0; |
| 295 | + unsigned num_iters = 10; |
| 296 | + |
| 297 | + try { |
| 298 | + // num_iters + 1, iteration#0 is for warmup |
| 299 | + for (int i = 0; i <= num_iters; ++i) { |
| 300 | + // make sure that image object has short live-range |
| 301 | + // than M |
| 302 | + cl::sycl::image<2> imgM((unsigned int *)M, image_channel_order::rgba, |
| 303 | + image_channel_type::unsigned_int32, |
| 304 | + range<2>{MZ / 4, MZ}); |
| 305 | + |
| 306 | + double etime = 0; |
| 307 | + if (block_size == 16 && MZ >= 16) { |
| 308 | + auto e = q.submit([&](handler &cgh) { |
| 309 | + auto accInput = |
| 310 | + imgM.get_access<uint4, cl::sycl::access::mode::read>(cgh); |
| 311 | + auto accOutput = |
| 312 | + imgM.get_access<uint4, cl::sycl::access::mode::write>(cgh); |
| 313 | + cgh.parallel_for<class K16>( |
| 314 | + Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL { |
| 315 | + transpose16(accInput, accOutput, MZ, ndi.get_global_id(0), |
| 316 | + ndi.get_global_id(1)); |
| 317 | + }); |
| 318 | + }); |
| 319 | + e.wait(); |
| 320 | + etime = esimd_test::report_time("kernel time", e, e); |
| 321 | + } else if (block_size == 8) { |
| 322 | + auto e = q.submit([&](handler &cgh) { |
| 323 | + auto accInput = |
| 324 | + imgM.get_access<uint4, cl::sycl::access::mode::read>(cgh); |
| 325 | + auto accOutput = |
| 326 | + imgM.get_access<uint4, cl::sycl::access::mode::write>(cgh); |
| 327 | + cgh.parallel_for<class K08>( |
| 328 | + Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL { |
| 329 | + transpose8(accInput, accOutput, MZ, ndi.get_global_id(0), |
| 330 | + ndi.get_global_id(1)); |
| 331 | + }); |
| 332 | + }); |
| 333 | + e.wait(); |
| 334 | + etime = esimd_test::report_time("kernel time", e, e); |
| 335 | + } |
| 336 | + |
| 337 | + if (i > 0) |
| 338 | + kernel_times += etime; |
| 339 | + else |
| 340 | + start = timer.Elapsed(); |
| 341 | + } |
| 342 | + } catch (cl::sycl::exception const &e) { |
| 343 | + std::cout << "SYCL exception caught: " << e.what() << '\n'; |
| 344 | + delete[] M; |
| 345 | + return e.get_cl_code(); |
| 346 | + } |
| 347 | + |
| 348 | + // End timer. |
| 349 | + double end = timer.Elapsed(); |
| 350 | + |
| 351 | + float total_time = (end - start) * 1000.0f / num_iters; |
| 352 | + float kernel_time = kernel_times / num_iters; |
| 353 | + |
| 354 | + float bandwidth_total = |
| 355 | + 2.0f * 1000 * sizeof(int) * MZ * MZ / (1024 * 1024 * 1024) / total_time; |
| 356 | + float bandwidth_kernel = |
| 357 | + 2.0f * 1000 * sizeof(int) * MZ * MZ / (1024 * 1024 * 1024) / kernel_time; |
| 358 | + |
| 359 | + cerr << "GPU transposition time = " << total_time << " msec\n"; |
| 360 | + cerr << "GPU transposition bandwidth = " << bandwidth_total << " GB/s\n"; |
| 361 | + cerr << "GPU kernel time = " << kernel_time << " msec\n"; |
| 362 | + cerr << "GPU kernel bandwidth = " << bandwidth_kernel << " GB/s\n"; |
| 363 | + |
| 364 | + // printMatrix("\nTransposed matrix:", M, MZ); |
| 365 | + bool success = checkResult(M, MZ); |
| 366 | + delete[] M; |
| 367 | + return success; |
| 368 | +} |
| 369 | + |
| 370 | +int main(int argc, char *argv[]) { |
| 371 | + unsigned MZ = 1U << 5; |
| 372 | + if (argc >= 2) { |
| 373 | + unsigned exponent = atoi(argv[1]); |
| 374 | + MZ = (MZ > (1U << exponent)) ? MZ : (1U << exponent); |
| 375 | + MZ = (MZ < (1U << 12)) ? MZ : (1U << 12); |
| 376 | + } |
| 377 | + |
| 378 | + bool success = true; |
| 379 | + success &= runTest(MZ, 16); |
| 380 | + if (argc == 1) { |
| 381 | + success &= runTest(1U << 10, 8); |
| 382 | + success &= runTest(1U << 11, 8); |
| 383 | + success &= runTest(1U << 12, 8); |
| 384 | + // success &= runTest(1U << 13, 8); |
| 385 | + success &= runTest(1U << 10, 16); |
| 386 | + success &= runTest(1U << 11, 16); |
| 387 | + success &= runTest(1U << 12, 16); |
| 388 | + // success &= runTest(1U << 13, 16); |
| 389 | + } |
| 390 | + |
| 391 | + cerr << (success ? "PASSED\n" : "FAILED\n"); |
| 392 | + return !success; |
| 393 | +} |
0 commit comments