Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit de991b1

Browse files
authored
[SYCL][ESIMD] Print performance data for matrix transpose tests in uniform way (#730)
Signed-off-by: Sergey Dmitriev <[email protected]>
1 parent a870304 commit de991b1

File tree

4 files changed: 74 additions (+74) and 96 deletions (-96).

SYCL/ESIMD/matrix_transpose.cpp

Lines changed: 18 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -269,7 +269,8 @@ ESIMD_INLINE void transpose16(AccessorTy buf, int MZ, int block_col,
269269
}
270270
}
271271

272-
bool runTest(unsigned MZ, unsigned block_size) {
272+
bool runTest(unsigned MZ, unsigned block_size, unsigned num_iters,
273+
double &kernel_times, double &total_times) {
273274
queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler(),
274275
property::queue::enable_profiling{});
275276
int *M = new int[MZ * MZ];
@@ -295,8 +296,6 @@ bool runTest(unsigned MZ, unsigned block_size) {
295296
double start;
296297

297298
// Launches the task on the GPU.
298-
double kernel_times = 0;
299-
unsigned num_iters = 10;
300299

301300
try {
302301
// num_iters + 1, iteration#0 is for warmup
@@ -343,18 +342,7 @@ bool runTest(unsigned MZ, unsigned block_size) {
343342
// End timer.
344343
double end = timer.Elapsed();
345344

346-
float total_time = (end - start) * 1000.0f / num_iters;
347-
float kernel_time = kernel_times / num_iters;
348-
349-
float bandwidth_total =
350-
2.0f * 1000 * sizeof(int) * MZ * MZ / (1024 * 1024 * 1024) / total_time;
351-
float bandwidth_kernel =
352-
2.0f * 1000 * sizeof(int) * MZ * MZ / (1024 * 1024 * 1024) / kernel_time;
353-
354-
cerr << "GPU transposition time = " << total_time << " msec\n";
355-
cerr << "GPU transposition bandwidth = " << bandwidth_total << " GB/s\n";
356-
cerr << "GPU kernel time = " << kernel_time << " msec\n";
357-
cerr << "GPU kernel bandwidth = " << bandwidth_kernel << " GB/s\n";
345+
total_times += (end - start) * 1000.0f;
358346

359347
// printMatrix("\nTransposed matrix:", M, MZ);
360348
bool success = checkResult(M, MZ);
@@ -370,19 +358,25 @@ int main(int argc, char *argv[]) {
370358
MZ = (MZ < (1U << 12)) ? MZ : (1U << 12);
371359
}
372360

361+
unsigned num_iters = 10;
362+
double kernel_times = 0;
363+
double total_times = 0;
364+
373365
bool success = true;
374-
success &= runTest(MZ, 16);
366+
success &= runTest(MZ, 16, num_iters, kernel_times, total_times);
375367
if (argc == 1) {
376-
success &= runTest(1U << 7, 8);
377-
success &= runTest(1U << 8, 8);
378-
success &= runTest(1U << 9, 8);
379-
// success &= runTest(1U << 13, 8);
380-
success &= runTest(1U << 7, 16);
381-
success &= runTest(1U << 8, 16);
382-
success &= runTest(1U << 9, 16);
383-
// success &= runTest(1U << 13, 16);
368+
success &= runTest(1U << 7, 8, num_iters, kernel_times, total_times);
369+
success &= runTest(1U << 8, 8, num_iters, kernel_times, total_times);
370+
success &= runTest(1U << 9, 8, num_iters, kernel_times, total_times);
371+
// success &= runTest(1U << 13, 8, num_iters, kernel_times, total_times);
372+
success &= runTest(1U << 7, 16, num_iters, kernel_times, total_times);
373+
success &= runTest(1U << 8, 16, num_iters, kernel_times, total_times);
374+
success &= runTest(1U << 9, 16, num_iters, kernel_times, total_times);
375+
// success &= runTest(1U << 13, 16, num_iters, kernel_times, total_times);
384376
}
385377

378+
esimd_test::display_timing_stats(kernel_times, num_iters, total_times);
379+
386380
cerr << (success ? "PASSED\n" : "FAILED\n");
387381
return !success;
388382
}

SYCL/ESIMD/matrix_transpose2.cpp

Lines changed: 18 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -265,7 +265,8 @@ ESIMD_INLINE void transpose16(AccessorInTy in, AccessorOutTy out, int MZ,
265265
}
266266
}
267267

268-
bool runTest(unsigned MZ, unsigned block_size) {
268+
bool runTest(unsigned MZ, unsigned block_size, unsigned num_iters,
269+
double &kernel_times, double &total_times) {
269270
queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler(),
270271
property::queue::enable_profiling{});
271272
int *M = new int[MZ * MZ];
@@ -291,8 +292,6 @@ bool runTest(unsigned MZ, unsigned block_size) {
291292
double start;
292293

293294
// Launches the task on the GPU.
294-
double kernel_times = 0;
295-
unsigned num_iters = 10;
296295

297296
try {
298297
// num_iters + 1, iteration#0 is for warmup
@@ -344,18 +343,7 @@ bool runTest(unsigned MZ, unsigned block_size) {
344343
// End timer.
345344
double end = timer.Elapsed();
346345

347-
float total_time = (end - start) * 1000.0f / num_iters;
348-
float kernel_time = kernel_times / num_iters;
349-
350-
float bandwidth_total =
351-
2.0f * 1000 * sizeof(int) * MZ * MZ / (1024 * 1024 * 1024) / total_time;
352-
float bandwidth_kernel =
353-
2.0f * 1000 * sizeof(int) * MZ * MZ / (1024 * 1024 * 1024) / kernel_time;
354-
355-
cerr << "GPU transposition time = " << total_time << " msec\n";
356-
cerr << "GPU transposition bandwidth = " << bandwidth_total << " GB/s\n";
357-
cerr << "GPU kernel time = " << kernel_time << " msec\n";
358-
cerr << "GPU kernel bandwidth = " << bandwidth_kernel << " GB/s\n";
346+
total_times += (end - start) * 1000.0f;
359347

360348
// printMatrix("\nTransposed matrix:", M, MZ);
361349
bool success = checkResult(M, MZ);
@@ -371,19 +359,25 @@ int main(int argc, char *argv[]) {
371359
MZ = (MZ < (1U << 12)) ? MZ : (1U << 12);
372360
}
373361

362+
unsigned num_iters = 10;
363+
double kernel_times = 0;
364+
double total_times = 0;
365+
374366
bool success = true;
375-
success &= runTest(MZ, 16);
367+
success &= runTest(MZ, 16, num_iters, kernel_times, total_times);
376368
if (argc == 1) {
377-
success &= runTest(1U << 10, 8);
378-
success &= runTest(1U << 11, 8);
379-
success &= runTest(1U << 12, 8);
380-
// success &= runTest(1U << 13, 8);
381-
success &= runTest(1U << 10, 16);
382-
success &= runTest(1U << 11, 16);
383-
success &= runTest(1U << 12, 16);
384-
// success &= runTest(1U << 13, 16);
369+
success &= runTest(1U << 10, 8, num_iters, kernel_times, total_times);
370+
success &= runTest(1U << 11, 8, num_iters, kernel_times, total_times);
371+
success &= runTest(1U << 12, 8, num_iters, kernel_times, total_times);
372+
// success &= runTest(1U << 13, 8, num_iters, kernel_times, total_times);
373+
success &= runTest(1U << 10, 16, num_iters, kernel_times, total_times);
374+
success &= runTest(1U << 11, 16, num_iters, kernel_times, total_times);
375+
success &= runTest(1U << 12, 16, num_iters, kernel_times, total_times);
376+
// success &= runTest(1U << 13, 16, num_iters, kernel_times, total_times);
385377
}
386378

379+
esimd_test::display_timing_stats(kernel_times, num_iters, total_times);
380+
387381
cerr << (success ? "PASSED\n" : "FAILED\n");
388382
return !success;
389383
}

SYCL/ESIMD/matrix_transpose_glb.cpp

Lines changed: 18 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -236,7 +236,8 @@ ESIMD_INLINE void transpose16(int *buf, int MZ, int block_col, int block_row) {
236236
}
237237
}
238238

239-
bool runTest(unsigned MZ, unsigned block_size) {
239+
bool runTest(unsigned MZ, unsigned block_size, unsigned num_iters,
240+
double &kernel_times, double &total_times) {
240241
queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler(),
241242
property::queue::enable_profiling{});
242243
int *M = malloc_shared<int>(MZ * MZ, q);
@@ -262,8 +263,6 @@ bool runTest(unsigned MZ, unsigned block_size) {
262263
double start;
263264

264265
// Launches the task on the GPU.
265-
double kernel_times = 0;
266-
unsigned num_iters = 10;
267266

268267
try {
269268
// num_iters + 1, iteration#0 is for warmup
@@ -303,18 +302,7 @@ bool runTest(unsigned MZ, unsigned block_size) {
303302
// End timer.
304303
double end = timer.Elapsed();
305304

306-
float total_time = (end - start) * 1000.0f / num_iters;
307-
float kernel_time = kernel_times / num_iters;
308-
309-
float bandwidth_total =
310-
2.0f * 1000 * sizeof(int) * MZ * MZ / (1024 * 1024 * 1024) / total_time;
311-
float bandwidth_kernel =
312-
2.0f * 1000 * sizeof(int) * MZ * MZ / (1024 * 1024 * 1024) / kernel_time;
313-
314-
cerr << "GPU transposition time = " << total_time << " msec\n";
315-
cerr << "GPU transposition bandwidth = " << bandwidth_total << " GB/s\n";
316-
cerr << "GPU kernel time = " << kernel_time << " msec\n";
317-
cerr << "GPU kernel bandwidth = " << bandwidth_kernel << " GB/s\n";
305+
total_times += (end - start) * 1000.0f;
318306

319307
// printMatrix("\nTransposed matrix:", M, MZ);
320308
bool success = checkResult(M, MZ);
@@ -330,19 +318,25 @@ int main(int argc, char *argv[]) {
330318
MZ = (MZ < (1U << 12)) ? MZ : (1U << 12);
331319
}
332320

321+
unsigned num_iters = 10;
322+
double kernel_times = 0;
323+
double total_times = 0;
324+
333325
bool success = true;
334-
success &= runTest(MZ, 16);
326+
success &= runTest(MZ, 16, num_iters, kernel_times, total_times);
335327
if (argc == 1) {
336-
success &= runTest(1U << 10, 8);
337-
success &= runTest(1U << 11, 8);
338-
success &= runTest(1U << 12, 8);
339-
// success &= runTest(1U << 13, 8);
340-
success &= runTest(1U << 10, 16);
341-
success &= runTest(1U << 11, 16);
342-
success &= runTest(1U << 12, 16);
343-
// success &= runTest(1U << 13, 16);
328+
success &= runTest(1U << 10, 8, num_iters, kernel_times, total_times);
329+
success &= runTest(1U << 11, 8, num_iters, kernel_times, total_times);
330+
success &= runTest(1U << 12, 8, num_iters, kernel_times, total_times);
331+
// success &= runTest(1U << 13, 8, num_iters, kernel_times, total_times);
332+
success &= runTest(1U << 10, 16, num_iters, kernel_times, total_times);
333+
success &= runTest(1U << 11, 16, num_iters, kernel_times, total_times);
334+
success &= runTest(1U << 12, 16, num_iters, kernel_times, total_times);
335+
// success &= runTest(1U << 13, 16, num_iters, kernel_times, total_times);
344336
}
345337

338+
esimd_test::display_timing_stats(kernel_times, num_iters, total_times);
339+
346340
cerr << (success ? "PASSED\n" : "FAILED\n");
347341
return !success;
348342
}

SYCL/ESIMD/matrix_transpose_usm.cpp

Lines changed: 20 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -241,7 +241,8 @@ ESIMD_INLINE void transpose16(int *buf, int MZ, int block_col, int block_row) {
241241
}
242242
}
243243

244-
bool runTest(queue &q, unsigned MZ, unsigned block_size) {
244+
bool runTest(queue &q, unsigned MZ, unsigned block_size, unsigned num_iters,
245+
double &kernel_times, double &total_times) {
245246
int *M = malloc_shared<int>(MZ * MZ, q);
246247

247248
initMatrix(M, MZ);
@@ -265,8 +266,6 @@ bool runTest(queue &q, unsigned MZ, unsigned block_size) {
265266
double start;
266267

267268
// Launches the task on the GPU.
268-
double kernel_times = 0;
269-
unsigned num_iters = 10;
270269

271270
try {
272271
// num_iters + 1, iteration#0 is for warmup
@@ -306,18 +305,7 @@ bool runTest(queue &q, unsigned MZ, unsigned block_size) {
306305
// End timer.
307306
double end = timer.Elapsed();
308307

309-
float total_time = (end - start) * 1000.0f / num_iters;
310-
float kernel_time = kernel_times / num_iters;
311-
312-
float bandwidth_total =
313-
2.0f * 1000 * sizeof(int) * MZ * MZ / (1024 * 1024 * 1024) / total_time;
314-
float bandwidth_kernel =
315-
2.0f * 1000 * sizeof(int) * MZ * MZ / (1024 * 1024 * 1024) / kernel_time;
316-
317-
cerr << "GPU transposition time = " << total_time << " msec\n";
318-
cerr << "GPU transposition bandwidth = " << bandwidth_total << " GB/s\n";
319-
cerr << "GPU kernel time = " << kernel_time << " msec\n";
320-
cerr << "GPU kernel bandwidth = " << bandwidth_kernel << " GB/s\n";
308+
total_times += (end - start) * 1000.0f / num_iters;
321309

322310
// printMatrix("\nTransposed matrix:", M, MZ);
323311
bool success = checkResult(M, MZ);
@@ -335,19 +323,27 @@ int main(int argc, char *argv[]) {
335323

336324
queue Q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler(),
337325
property::queue::enable_profiling{});
326+
327+
unsigned num_iters = 10;
328+
double kernel_times = 0;
329+
double total_times = 0;
330+
338331
bool success = true;
339-
success &= runTest(Q, MZ, 16);
332+
success &= runTest(Q, MZ, 16, num_iters, kernel_times, total_times);
340333
if (argc == 1) {
341-
success &= runTest(Q, 1U << 7, 8);
342-
success &= runTest(Q, 1U << 8, 8);
343-
success &= runTest(Q, 1U << 9, 8);
344-
// success &= runTest(Q, 1U << 13, 8);
345-
success &= runTest(Q, 1U << 7, 16);
346-
success &= runTest(Q, 1U << 8, 16);
347-
success &= runTest(Q, 1U << 9, 16);
348-
// success &= runTest(Q, 1U << 13, 16);
334+
success &= runTest(Q, 1U << 7, 8, num_iters, kernel_times, total_times);
335+
success &= runTest(Q, 1U << 8, 8, num_iters, kernel_times, total_times);
336+
success &= runTest(Q, 1U << 9, 8, num_iters, kernel_times, total_times);
337+
// success &= runTest(Q, 1U << 13, 8, num_iters, kernel_times, total_times);
338+
success &= runTest(Q, 1U << 7, 16, num_iters, kernel_times, total_times);
339+
success &= runTest(Q, 1U << 8, 16, num_iters, kernel_times, total_times);
340+
success &= runTest(Q, 1U << 9, 16, num_iters, kernel_times, total_times);
341+
// success &= runTest(Q, 1U << 13, 16, num_iters, kernel_times,
342+
// total_times);
349343
}
350344

345+
esimd_test::display_timing_stats(kernel_times, num_iters, total_times);
346+
351347
cerr << (success ? "PASSED\n" : "FAILED\n");
352348
return !success;
353349
}

0 commit comments

Comments
 (0)