Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit b5ea11e

Browse files
authored
[ESIMD] Add improved version of matrix transpose test. (#346)
* [ESIMD] Add improved version of matrix transpose test.
1 parent 6dff186 commit b5ea11e

File tree

1 file changed

+393
-0
lines changed

1 file changed

+393
-0
lines changed

SYCL/ESIMD/matrix_transpose2.cpp

Lines changed: 393 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,393 @@
1+
//==--------------- matrix_transpose2.cpp - DPC++ ESIMD on-device test ----==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
// REQUIRES: gpu
9+
// UNSUPPORTED: cuda
10+
// RUN: %clangxx -fsycl %s -o %t.out
11+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
12+
13+
// This test checks matrix transpose implementation with media block read/write
14+
15+
#include "esimd_test_utils.hpp"
16+
17+
#include <CL/sycl.hpp>
18+
#include <CL/sycl/INTEL/esimd.hpp>
19+
#include <iostream>
20+
21+
using namespace cl::sycl;
22+
using namespace std;
23+
using namespace sycl::ext::intel::experimental::esimd;
24+
25+
void initMatrix(int *M, unsigned N) {
26+
assert(N >= 8 && (((N - 1) & N) == 0) &&
27+
"only power of 2 (>= 16) is supported");
28+
for (unsigned i = 0; i < N * N; ++i)
29+
M[i] = i;
30+
}
31+
32+
void printMatrix(const char *msg, int *M, unsigned N) {
33+
cerr << msg << "\n";
34+
if (N > 64) {
35+
cerr << "<<<maitrix of size " << N << " x " << N << ">>>\n";
36+
return;
37+
}
38+
39+
for (unsigned i = 0; i < N; ++i) {
40+
for (unsigned j = 0; j < N; ++j) {
41+
cerr.width(4);
42+
cerr << M[i * N + j] << " ";
43+
}
44+
cerr << "\n";
45+
}
46+
}
47+
48+
bool checkResult(const int *M, unsigned N) {
49+
for (unsigned i = 0; i < N; ++i) {
50+
for (unsigned j = 0; j < N; ++j) {
51+
unsigned t = M[j * N + i];
52+
if (t != i * N + j) {
53+
cerr << "Error at M(" << i << ", " << j << ") = " << t << "\n";
54+
return false;
55+
}
56+
}
57+
}
58+
return true;
59+
}
60+
61+
// The basic idea of vecotrizing transposition can be illustrated by
62+
// transposing a 2 x 2 matrix as follows:
63+
//
64+
// A B
65+
// C D
66+
// ==>
67+
// merge([A, A, B, B], [C, C, D, D], 0b1010) = [A, C, B, D]
68+
// ==>
69+
// A C
70+
// B D
71+
//
72+
template <typename T>
73+
ESIMD_INLINE simd<T, 64> transpose_matrix(simd<T, 64> v1) {
74+
simd<T, 64> v2;
75+
// mask to control how to merge two vectors.
76+
simd<uint16_t, 16> mask = 0;
77+
mask.select<8, 2>(0) = 1;
78+
auto t1 = v1.template bit_cast_view<T, 4, 16>();
79+
auto t2 = v2.template bit_cast_view<T, 4, 16>();
80+
81+
// j = 1
82+
t2.row(0).merge(t1.template replicate<8, 1, 2, 0>(0, 0),
83+
t1.template replicate<8, 1, 2, 0>(2, 0), mask);
84+
t2.row(1).merge(t1.template replicate<8, 1, 2, 0>(0, 8),
85+
t1.template replicate<8, 1, 2, 0>(2, 8), mask);
86+
t2.row(2).merge(t1.template replicate<8, 1, 2, 0>(1, 0),
87+
t1.template replicate<8, 1, 2, 0>(3, 0), mask);
88+
t2.row(3).merge(t1.template replicate<8, 1, 2, 0>(1, 8),
89+
t1.template replicate<8, 1, 2, 0>(3, 8), mask);
90+
91+
// j = 2
92+
t1.row(0).merge(t2.template replicate<8, 1, 2, 0>(0, 0),
93+
t2.template replicate<8, 1, 2, 0>(2, 0), mask);
94+
t1.row(1).merge(t2.template replicate<8, 1, 2, 0>(0, 8),
95+
t2.template replicate<8, 1, 2, 0>(2, 8), mask);
96+
t1.row(2).merge(t2.template replicate<8, 1, 2, 0>(1, 0),
97+
t2.template replicate<8, 1, 2, 0>(3, 0), mask);
98+
t1.row(3).merge(t2.template replicate<8, 1, 2, 0>(1, 8),
99+
t2.template replicate<8, 1, 2, 0>(3, 8), mask);
100+
101+
// j = 4
102+
t2.row(0).merge(t1.template replicate<8, 1, 2, 0>(0, 0),
103+
t1.template replicate<8, 1, 2, 0>(2, 0), mask);
104+
t2.row(1).merge(t1.template replicate<8, 1, 2, 0>(0, 8),
105+
t1.template replicate<8, 1, 2, 0>(2, 8), mask);
106+
t2.row(2).merge(t1.template replicate<8, 1, 2, 0>(1, 0),
107+
t1.template replicate<8, 1, 2, 0>(3, 0), mask);
108+
t2.row(3).merge(t1.template replicate<8, 1, 2, 0>(1, 8),
109+
t1.template replicate<8, 1, 2, 0>(3, 8), mask);
110+
return v2;
111+
}
112+
113+
// read a N-by-N sub-matrix
114+
template <typename T, int N, typename AccessorTy>
115+
ESIMD_INLINE simd<T, N * N> read(AccessorTy img, int MZ, int col, int row) {
116+
static_assert(N == 8, "only 8x8 sub-matrix is supported");
117+
118+
simd<T, N * N> res;
119+
auto in = res.template bit_cast_view<unsigned char, 8, 32>();
120+
in = media_block_load<unsigned char, 8, 32>(img, col * sizeof(T), row);
121+
122+
return res;
123+
}
124+
125+
// write a N-by-N sub-matrix
126+
template <typename T, int N, typename AccessorTy>
127+
ESIMD_INLINE void write(AccessorTy img, int MZ, int col, int row,
128+
simd<T, N * N> val) {
129+
static_assert(N == 8, "only 8x8 sub-matrix is supported");
130+
131+
auto out = val.template bit_cast_view<uchar, 8, 32>();
132+
media_block_store<unsigned char, 8, 32>(img, col * sizeof(T), row, out);
133+
}
134+
135+
// Square matrix transposition on block of size 8x8
136+
// input and output are in the same image
137+
template <typename AccessorInTy, typename AccessorOutTy>
138+
ESIMD_INLINE void transpose8(AccessorInTy in, AccessorOutTy out, int MZ,
139+
int block_col, int block_row) {
140+
static const int N = 8;
141+
142+
if (block_row == block_col) {
143+
auto m1 = read<int, N, AccessorInTy>(in, MZ, N * block_col, N * block_row);
144+
145+
// cerr << m1 << std::endl;
146+
147+
auto t1 = transpose_matrix(m1);
148+
149+
// cerr << t1 << std::endl;
150+
151+
write<int, N, AccessorOutTy>(out, MZ, N * block_row, N * block_col, t1);
152+
} else if (block_row < block_col) {
153+
// Read two blocks to be swapped.
154+
auto m1 = read<int, N, AccessorInTy>(in, MZ, N * block_col, N * block_row);
155+
auto m2 = read<int, N, AccessorInTy>(in, MZ, N * block_row, N * block_col);
156+
157+
// Tranpose them.
158+
auto t1 = transpose_matrix(m1);
159+
auto t2 = transpose_matrix(m2);
160+
161+
// Write them back to the transposed location.
162+
write<int, N, AccessorOutTy>(out, MZ, N * block_row, N * block_col, t1);
163+
write<int, N, AccessorOutTy>(out, MZ, N * block_col, N * block_row, t2);
164+
}
165+
}
166+
167+
// Square matrix transposition on block of size 16x16.
168+
// In this version, a thread handle a block of size 16x16 which allows
169+
// to better latency hidding and subsentantially improve overall performance.
170+
//
171+
template <typename AccessorInTy, typename AccessorOutTy>
172+
ESIMD_INLINE void transpose16(AccessorInTy in, AccessorOutTy out, int MZ,
173+
int block_col, int block_row) {
174+
static const int N = 16;
175+
176+
if (block_row == block_col) {
177+
// Read a tile of 4 8x8 sub-blocks:
178+
//
179+
// [ m00 m01 ]
180+
// [ m10 m11 ]
181+
//
182+
// matrix<int, 8, 8> m00, m01, m10, m11, t00, t01, t10, t11;
183+
auto m00 = read<int, 8, AccessorInTy>(in, MZ, (N * block_col + 0),
184+
N * block_row + 0);
185+
auto m01 = read<int, 8, AccessorInTy>(in, MZ, (N * block_col + 8),
186+
N * block_row + 0);
187+
auto m10 = read<int, 8, AccessorInTy>(in, MZ, (N * block_col + 0),
188+
N * block_row + 8);
189+
auto m11 = read<int, 8, AccessorInTy>(in, MZ, (N * block_col + 8),
190+
N * block_row + 8);
191+
192+
// Tranpose sub-blocks.
193+
auto t00 = transpose_matrix(m00);
194+
auto t01 = transpose_matrix(m01);
195+
auto t10 = transpose_matrix(m10);
196+
auto t11 = transpose_matrix(m11);
197+
198+
// write out as
199+
//
200+
// [t00 t10]
201+
// [t01 t11]
202+
//
203+
write<int, 8, AccessorOutTy>(out, MZ, (N * block_col + 0),
204+
N * block_row + 0, t00);
205+
write<int, 8, AccessorOutTy>(out, MZ, (N * block_col + 8),
206+
N * block_row + 0, t10);
207+
write<int, 8, AccessorOutTy>(out, MZ, (N * block_col + 0),
208+
N * block_row + 8, t01);
209+
write<int, 8, AccessorOutTy>(out, MZ, (N * block_col + 8),
210+
N * block_row + 8, t11);
211+
} else if (block_row < block_col) {
212+
// Read two tiles of 4 8x8 sub-blocks to be swapped.
213+
//
214+
// matrix<int, 8, 8> a00, a01, a10, a11, t00, t01, t10, t11;
215+
auto a00 = read<int, 8, AccessorInTy>(in, MZ, (N * block_col + 0),
216+
N * block_row + 0);
217+
auto a01 = read<int, 8, AccessorInTy>(in, MZ, (N * block_col + 8),
218+
N * block_row + 0);
219+
auto a10 = read<int, 8, AccessorInTy>(in, MZ, (N * block_col + 0),
220+
N * block_row + 8);
221+
auto a11 = read<int, 8, AccessorInTy>(in, MZ, (N * block_col + 8),
222+
N * block_row + 8);
223+
224+
// matrix<int, 8, 8> b00, b01, b10, b11, r00, r01, r10, r11;
225+
auto b00 = read<int, 8, AccessorInTy>(in, MZ, (N * block_row + 0),
226+
N * block_col + 0);
227+
auto b01 = read<int, 8, AccessorInTy>(in, MZ, (N * block_row + 8),
228+
N * block_col + 0);
229+
auto b10 = read<int, 8, AccessorInTy>(in, MZ, (N * block_row + 0),
230+
N * block_col + 8);
231+
auto b11 = read<int, 8, AccessorInTy>(in, MZ, (N * block_row + 8),
232+
N * block_col + 8);
233+
234+
// Tranpose the first tile.
235+
auto t00 = transpose_matrix(a00);
236+
auto t01 = transpose_matrix(a01);
237+
auto t10 = transpose_matrix(a10);
238+
auto t11 = transpose_matrix(a11);
239+
240+
// Tranpose the second tile.
241+
auto r00 = transpose_matrix(b00);
242+
auto r01 = transpose_matrix(b01);
243+
auto r10 = transpose_matrix(b10);
244+
auto r11 = transpose_matrix(b11);
245+
246+
// Write the first tile to the transposed location.
247+
write<int, 8, AccessorOutTy>(out, MZ, (N * block_row + 0),
248+
N * block_col + 0, t00);
249+
write<int, 8, AccessorOutTy>(out, MZ, (N * block_row + 8),
250+
N * block_col + 0, t10);
251+
write<int, 8, AccessorOutTy>(out, MZ, (N * block_row + 0),
252+
N * block_col + 8, t01);
253+
write<int, 8, AccessorOutTy>(out, MZ, (N * block_row + 8),
254+
N * block_col + 8, t11);
255+
256+
// Write the second tile to the transposed location.
257+
write<int, 8, AccessorOutTy>(out, MZ, (N * block_col + 0),
258+
N * block_row + 0, r00);
259+
write<int, 8, AccessorOutTy>(out, MZ, (N * block_col + 8),
260+
N * block_row + 0, r10);
261+
write<int, 8, AccessorOutTy>(out, MZ, (N * block_col + 0),
262+
N * block_row + 8, r01);
263+
write<int, 8, AccessorOutTy>(out, MZ, (N * block_col + 8),
264+
N * block_row + 8, r11);
265+
}
266+
}
267+
268+
bool runTest(unsigned MZ, unsigned block_size) {
269+
queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler(),
270+
property::queue::enable_profiling{});
271+
int *M = new int[MZ * MZ];
272+
273+
initMatrix(M, MZ);
274+
cerr << "\nTranspose square matrix of size " << MZ << "\n";
275+
// printMatrix("Initial matrix:", M, MZ);
276+
277+
// Each C-for-Metal thread works on one or two blocks of size 8 x 8.
278+
int thread_width = MZ / block_size;
279+
int thread_height = MZ / block_size;
280+
281+
// create ranges
282+
// We need that many workitems
283+
auto GlobalRange = cl::sycl::range<2>(thread_width, thread_height);
284+
285+
// Number of workitems in a workgroup
286+
cl::sycl::range<2> LocalRange{1, 1};
287+
cl::sycl::nd_range<2> Range(GlobalRange, LocalRange);
288+
289+
// Start timer.
290+
esimd_test::Timer timer;
291+
double start;
292+
293+
// Launches the task on the GPU.
294+
double kernel_times = 0;
295+
unsigned num_iters = 10;
296+
297+
try {
298+
// num_iters + 1, iteration#0 is for warmup
299+
for (int i = 0; i <= num_iters; ++i) {
300+
// make sure that image object has short live-range
301+
// than M
302+
cl::sycl::image<2> imgM((unsigned int *)M, image_channel_order::rgba,
303+
image_channel_type::unsigned_int32,
304+
range<2>{MZ / 4, MZ});
305+
306+
double etime = 0;
307+
if (block_size == 16 && MZ >= 16) {
308+
auto e = q.submit([&](handler &cgh) {
309+
auto accInput =
310+
imgM.get_access<uint4, cl::sycl::access::mode::read>(cgh);
311+
auto accOutput =
312+
imgM.get_access<uint4, cl::sycl::access::mode::write>(cgh);
313+
cgh.parallel_for<class K16>(
314+
Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL {
315+
transpose16(accInput, accOutput, MZ, ndi.get_global_id(0),
316+
ndi.get_global_id(1));
317+
});
318+
});
319+
e.wait();
320+
etime = esimd_test::report_time("kernel time", e, e);
321+
} else if (block_size == 8) {
322+
auto e = q.submit([&](handler &cgh) {
323+
auto accInput =
324+
imgM.get_access<uint4, cl::sycl::access::mode::read>(cgh);
325+
auto accOutput =
326+
imgM.get_access<uint4, cl::sycl::access::mode::write>(cgh);
327+
cgh.parallel_for<class K08>(
328+
Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL {
329+
transpose8(accInput, accOutput, MZ, ndi.get_global_id(0),
330+
ndi.get_global_id(1));
331+
});
332+
});
333+
e.wait();
334+
etime = esimd_test::report_time("kernel time", e, e);
335+
}
336+
337+
if (i > 0)
338+
kernel_times += etime;
339+
else
340+
start = timer.Elapsed();
341+
}
342+
} catch (cl::sycl::exception const &e) {
343+
std::cout << "SYCL exception caught: " << e.what() << '\n';
344+
delete[] M;
345+
return e.get_cl_code();
346+
}
347+
348+
// End timer.
349+
double end = timer.Elapsed();
350+
351+
float total_time = (end - start) * 1000.0f / num_iters;
352+
float kernel_time = kernel_times / num_iters;
353+
354+
float bandwidth_total =
355+
2.0f * 1000 * sizeof(int) * MZ * MZ / (1024 * 1024 * 1024) / total_time;
356+
float bandwidth_kernel =
357+
2.0f * 1000 * sizeof(int) * MZ * MZ / (1024 * 1024 * 1024) / kernel_time;
358+
359+
cerr << "GPU transposition time = " << total_time << " msec\n";
360+
cerr << "GPU transposition bandwidth = " << bandwidth_total << " GB/s\n";
361+
cerr << "GPU kernel time = " << kernel_time << " msec\n";
362+
cerr << "GPU kernel bandwidth = " << bandwidth_kernel << " GB/s\n";
363+
364+
// printMatrix("\nTransposed matrix:", M, MZ);
365+
bool success = checkResult(M, MZ);
366+
delete[] M;
367+
return success;
368+
}
369+
370+
int main(int argc, char *argv[]) {
371+
unsigned MZ = 1U << 5;
372+
if (argc >= 2) {
373+
unsigned exponent = atoi(argv[1]);
374+
MZ = (MZ > (1U << exponent)) ? MZ : (1U << exponent);
375+
MZ = (MZ < (1U << 12)) ? MZ : (1U << 12);
376+
}
377+
378+
bool success = true;
379+
success &= runTest(MZ, 16);
380+
if (argc == 1) {
381+
success &= runTest(1U << 10, 8);
382+
success &= runTest(1U << 11, 8);
383+
success &= runTest(1U << 12, 8);
384+
// success &= runTest(1U << 13, 8);
385+
success &= runTest(1U << 10, 16);
386+
success &= runTest(1U << 11, 16);
387+
success &= runTest(1U << 12, 16);
388+
// success &= runTest(1U << 13, 16);
389+
}
390+
391+
cerr << (success ? "PASSED\n" : "FAILED\n");
392+
return !success;
393+
}

0 commit comments

Comments
 (0)