Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

[SYCL][ESIMD] Print performance data for matrix transpose tests in uniform way #730

Merged
merged 1 commit into from
Jan 13, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 18 additions & 24 deletions SYCL/ESIMD/matrix_transpose.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,8 @@ ESIMD_INLINE void transpose16(AccessorTy buf, int MZ, int block_col,
}
}

bool runTest(unsigned MZ, unsigned block_size) {
bool runTest(unsigned MZ, unsigned block_size, unsigned num_iters,
double &kernel_times, double &total_times) {
queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler(),
property::queue::enable_profiling{});
int *M = new int[MZ * MZ];
Expand All @@ -295,8 +296,6 @@ bool runTest(unsigned MZ, unsigned block_size) {
double start;

// Launches the task on the GPU.
double kernel_times = 0;
unsigned num_iters = 10;

try {
// num_iters + 1, iteration#0 is for warmup
Expand Down Expand Up @@ -343,18 +342,7 @@ bool runTest(unsigned MZ, unsigned block_size) {
// End timer.
double end = timer.Elapsed();

float total_time = (end - start) * 1000.0f / num_iters;
float kernel_time = kernel_times / num_iters;

float bandwidth_total =
2.0f * 1000 * sizeof(int) * MZ * MZ / (1024 * 1024 * 1024) / total_time;
float bandwidth_kernel =
2.0f * 1000 * sizeof(int) * MZ * MZ / (1024 * 1024 * 1024) / kernel_time;

cerr << "GPU transposition time = " << total_time << " msec\n";
cerr << "GPU transposition bandwidth = " << bandwidth_total << " GB/s\n";
cerr << "GPU kernel time = " << kernel_time << " msec\n";
cerr << "GPU kernel bandwidth = " << bandwidth_kernel << " GB/s\n";
total_times += (end - start) * 1000.0f;

// printMatrix("\nTransposed matrix:", M, MZ);
bool success = checkResult(M, MZ);
Expand All @@ -370,19 +358,25 @@ int main(int argc, char *argv[]) {
MZ = (MZ < (1U << 12)) ? MZ : (1U << 12);
}

unsigned num_iters = 10;
double kernel_times = 0;
double total_times = 0;

bool success = true;
success &= runTest(MZ, 16);
success &= runTest(MZ, 16, num_iters, kernel_times, total_times);
if (argc == 1) {
success &= runTest(1U << 7, 8);
success &= runTest(1U << 8, 8);
success &= runTest(1U << 9, 8);
// success &= runTest(1U << 13, 8);
success &= runTest(1U << 7, 16);
success &= runTest(1U << 8, 16);
success &= runTest(1U << 9, 16);
// success &= runTest(1U << 13, 16);
success &= runTest(1U << 7, 8, num_iters, kernel_times, total_times);
success &= runTest(1U << 8, 8, num_iters, kernel_times, total_times);
success &= runTest(1U << 9, 8, num_iters, kernel_times, total_times);
// success &= runTest(1U << 13, 8, num_iters, kernel_times, total_times);
success &= runTest(1U << 7, 16, num_iters, kernel_times, total_times);
success &= runTest(1U << 8, 16, num_iters, kernel_times, total_times);
success &= runTest(1U << 9, 16, num_iters, kernel_times, total_times);
// success &= runTest(1U << 13, 16, num_iters, kernel_times, total_times);
}

esimd_test::display_timing_stats(kernel_times, num_iters, total_times);

cerr << (success ? "PASSED\n" : "FAILED\n");
return !success;
}
42 changes: 18 additions & 24 deletions SYCL/ESIMD/matrix_transpose2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,8 @@ ESIMD_INLINE void transpose16(AccessorInTy in, AccessorOutTy out, int MZ,
}
}

bool runTest(unsigned MZ, unsigned block_size) {
bool runTest(unsigned MZ, unsigned block_size, unsigned num_iters,
double &kernel_times, double &total_times) {
queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler(),
property::queue::enable_profiling{});
int *M = new int[MZ * MZ];
Expand All @@ -291,8 +292,6 @@ bool runTest(unsigned MZ, unsigned block_size) {
double start;

// Launches the task on the GPU.
double kernel_times = 0;
unsigned num_iters = 10;

try {
// num_iters + 1, iteration#0 is for warmup
Expand Down Expand Up @@ -344,18 +343,7 @@ bool runTest(unsigned MZ, unsigned block_size) {
// End timer.
double end = timer.Elapsed();

float total_time = (end - start) * 1000.0f / num_iters;
float kernel_time = kernel_times / num_iters;

float bandwidth_total =
2.0f * 1000 * sizeof(int) * MZ * MZ / (1024 * 1024 * 1024) / total_time;
float bandwidth_kernel =
2.0f * 1000 * sizeof(int) * MZ * MZ / (1024 * 1024 * 1024) / kernel_time;

cerr << "GPU transposition time = " << total_time << " msec\n";
cerr << "GPU transposition bandwidth = " << bandwidth_total << " GB/s\n";
cerr << "GPU kernel time = " << kernel_time << " msec\n";
cerr << "GPU kernel bandwidth = " << bandwidth_kernel << " GB/s\n";
total_times += (end - start) * 1000.0f;

// printMatrix("\nTransposed matrix:", M, MZ);
bool success = checkResult(M, MZ);
Expand All @@ -371,19 +359,25 @@ int main(int argc, char *argv[]) {
MZ = (MZ < (1U << 12)) ? MZ : (1U << 12);
}

unsigned num_iters = 10;
double kernel_times = 0;
double total_times = 0;

bool success = true;
success &= runTest(MZ, 16);
success &= runTest(MZ, 16, num_iters, kernel_times, total_times);
if (argc == 1) {
success &= runTest(1U << 10, 8);
success &= runTest(1U << 11, 8);
success &= runTest(1U << 12, 8);
// success &= runTest(1U << 13, 8);
success &= runTest(1U << 10, 16);
success &= runTest(1U << 11, 16);
success &= runTest(1U << 12, 16);
// success &= runTest(1U << 13, 16);
success &= runTest(1U << 10, 8, num_iters, kernel_times, total_times);
success &= runTest(1U << 11, 8, num_iters, kernel_times, total_times);
success &= runTest(1U << 12, 8, num_iters, kernel_times, total_times);
// success &= runTest(1U << 13, 8, num_iters, kernel_times, total_times);
success &= runTest(1U << 10, 16, num_iters, kernel_times, total_times);
success &= runTest(1U << 11, 16, num_iters, kernel_times, total_times);
success &= runTest(1U << 12, 16, num_iters, kernel_times, total_times);
// success &= runTest(1U << 13, 16, num_iters, kernel_times, total_times);
}

esimd_test::display_timing_stats(kernel_times, num_iters, total_times);

cerr << (success ? "PASSED\n" : "FAILED\n");
return !success;
}
42 changes: 18 additions & 24 deletions SYCL/ESIMD/matrix_transpose_glb.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,8 @@ ESIMD_INLINE void transpose16(int *buf, int MZ, int block_col, int block_row) {
}
}

bool runTest(unsigned MZ, unsigned block_size) {
bool runTest(unsigned MZ, unsigned block_size, unsigned num_iters,
double &kernel_times, double &total_times) {
queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler(),
property::queue::enable_profiling{});
int *M = malloc_shared<int>(MZ * MZ, q);
Expand All @@ -262,8 +263,6 @@ bool runTest(unsigned MZ, unsigned block_size) {
double start;

// Launches the task on the GPU.
double kernel_times = 0;
unsigned num_iters = 10;

try {
// num_iters + 1, iteration#0 is for warmup
Expand Down Expand Up @@ -303,18 +302,7 @@ bool runTest(unsigned MZ, unsigned block_size) {
// End timer.
double end = timer.Elapsed();

float total_time = (end - start) * 1000.0f / num_iters;
float kernel_time = kernel_times / num_iters;

float bandwidth_total =
2.0f * 1000 * sizeof(int) * MZ * MZ / (1024 * 1024 * 1024) / total_time;
float bandwidth_kernel =
2.0f * 1000 * sizeof(int) * MZ * MZ / (1024 * 1024 * 1024) / kernel_time;

cerr << "GPU transposition time = " << total_time << " msec\n";
cerr << "GPU transposition bandwidth = " << bandwidth_total << " GB/s\n";
cerr << "GPU kernel time = " << kernel_time << " msec\n";
cerr << "GPU kernel bandwidth = " << bandwidth_kernel << " GB/s\n";
total_times += (end - start) * 1000.0f;

// printMatrix("\nTransposed matrix:", M, MZ);
bool success = checkResult(M, MZ);
Expand All @@ -330,19 +318,25 @@ int main(int argc, char *argv[]) {
MZ = (MZ < (1U << 12)) ? MZ : (1U << 12);
}

unsigned num_iters = 10;
double kernel_times = 0;
double total_times = 0;

bool success = true;
success &= runTest(MZ, 16);
success &= runTest(MZ, 16, num_iters, kernel_times, total_times);
if (argc == 1) {
success &= runTest(1U << 10, 8);
success &= runTest(1U << 11, 8);
success &= runTest(1U << 12, 8);
// success &= runTest(1U << 13, 8);
success &= runTest(1U << 10, 16);
success &= runTest(1U << 11, 16);
success &= runTest(1U << 12, 16);
// success &= runTest(1U << 13, 16);
success &= runTest(1U << 10, 8, num_iters, kernel_times, total_times);
success &= runTest(1U << 11, 8, num_iters, kernel_times, total_times);
success &= runTest(1U << 12, 8, num_iters, kernel_times, total_times);
// success &= runTest(1U << 13, 8, num_iters, kernel_times, total_times);
success &= runTest(1U << 10, 16, num_iters, kernel_times, total_times);
success &= runTest(1U << 11, 16, num_iters, kernel_times, total_times);
success &= runTest(1U << 12, 16, num_iters, kernel_times, total_times);
// success &= runTest(1U << 13, 16, num_iters, kernel_times, total_times);
}

esimd_test::display_timing_stats(kernel_times, num_iters, total_times);

cerr << (success ? "PASSED\n" : "FAILED\n");
return !success;
}
44 changes: 20 additions & 24 deletions SYCL/ESIMD/matrix_transpose_usm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,8 @@ ESIMD_INLINE void transpose16(int *buf, int MZ, int block_col, int block_row) {
}
}

bool runTest(queue &q, unsigned MZ, unsigned block_size) {
bool runTest(queue &q, unsigned MZ, unsigned block_size, unsigned num_iters,
double &kernel_times, double &total_times) {
int *M = malloc_shared<int>(MZ * MZ, q);

initMatrix(M, MZ);
Expand All @@ -265,8 +266,6 @@ bool runTest(queue &q, unsigned MZ, unsigned block_size) {
double start;

// Launches the task on the GPU.
double kernel_times = 0;
unsigned num_iters = 10;

try {
// num_iters + 1, iteration#0 is for warmup
Expand Down Expand Up @@ -306,18 +305,7 @@ bool runTest(queue &q, unsigned MZ, unsigned block_size) {
// End timer.
double end = timer.Elapsed();

float total_time = (end - start) * 1000.0f / num_iters;
float kernel_time = kernel_times / num_iters;

float bandwidth_total =
2.0f * 1000 * sizeof(int) * MZ * MZ / (1024 * 1024 * 1024) / total_time;
float bandwidth_kernel =
2.0f * 1000 * sizeof(int) * MZ * MZ / (1024 * 1024 * 1024) / kernel_time;

cerr << "GPU transposition time = " << total_time << " msec\n";
cerr << "GPU transposition bandwidth = " << bandwidth_total << " GB/s\n";
cerr << "GPU kernel time = " << kernel_time << " msec\n";
cerr << "GPU kernel bandwidth = " << bandwidth_kernel << " GB/s\n";
// Accumulate the un-averaged elapsed time in msec, the same way the sibling
// matrix_transpose tests do; num_iters is handed separately to
// esimd_test::display_timing_stats by main(), so dividing here as well would
// put this test's total on a different scale from the others.
total_times += (end - start) * 1000.0f;

// printMatrix("\nTransposed matrix:", M, MZ);
bool success = checkResult(M, MZ);
Expand All @@ -335,19 +323,27 @@ int main(int argc, char *argv[]) {

queue Q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler(),
property::queue::enable_profiling{});

unsigned num_iters = 10;
double kernel_times = 0;
double total_times = 0;

bool success = true;
success &= runTest(Q, MZ, 16);
success &= runTest(Q, MZ, 16, num_iters, kernel_times, total_times);
if (argc == 1) {
success &= runTest(Q, 1U << 7, 8);
success &= runTest(Q, 1U << 8, 8);
success &= runTest(Q, 1U << 9, 8);
// success &= runTest(Q, 1U << 13, 8);
success &= runTest(Q, 1U << 7, 16);
success &= runTest(Q, 1U << 8, 16);
success &= runTest(Q, 1U << 9, 16);
// success &= runTest(Q, 1U << 13, 16);
success &= runTest(Q, 1U << 7, 8, num_iters, kernel_times, total_times);
success &= runTest(Q, 1U << 8, 8, num_iters, kernel_times, total_times);
success &= runTest(Q, 1U << 9, 8, num_iters, kernel_times, total_times);
// success &= runTest(Q, 1U << 13, 8, num_iters, kernel_times, total_times);
success &= runTest(Q, 1U << 7, 16, num_iters, kernel_times, total_times);
success &= runTest(Q, 1U << 8, 16, num_iters, kernel_times, total_times);
success &= runTest(Q, 1U << 9, 16, num_iters, kernel_times, total_times);
// success &= runTest(Q, 1U << 13, 16, num_iters, kernel_times,
// total_times);
}

esimd_test::display_timing_stats(kernel_times, num_iters, total_times);

cerr << (success ? "PASSED\n" : "FAILED\n");
return !success;
}