Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

[SYCL][ESIMD] Print performance data for histogram tests in uniform way #524

Merged
merged 6 commits into from
Oct 21, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions SYCL/ESIMD/esimd_test_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,4 +190,12 @@ inline double report_time(const std::string &msg, event e0, event en) {
return elapsed;
}

void display_timing_stats(double const kernelTime,
unsigned int const uiNumberOfIterations,
double const overallTime) {
std::cout << "Number of iterations: " << uiNumberOfIterations << "\n";
std::cout << "[KernelTime]:" << kernelTime << "\n";
std::cout << "[OverallTime][Primary]:" << overallTime << "\n";
}

} // namespace esimd_test
16 changes: 13 additions & 3 deletions SYCL/ESIMD/histogram.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,11 +132,16 @@ int main(int argc, char *argv[]) {
image_channel_type::unsigned_int32,
range<2>{width / sizeof(uint4), height});

// Start Timer
esimd_test::Timer timer;
double start;

// Launches the task on the GPU.
double kernel_times = 0;
unsigned num_iters = 10;

try {
// num_iters + 1, iteration#0 is for warmup
for (int iter = 0; iter <= num_iters; ++iter) {
double etime = 0;
for (int b = 0; b < NUM_BINS; b++)
Expand Down Expand Up @@ -215,17 +220,22 @@ int main(int argc, char *argv[]) {
etime = esimd_test::report_time("kernel time", e, e);
if (iter > 0)
kernel_times += etime;
else
start = timer.Elapsed();
}
// SYCL will enqueue and run the kernel. Recall that the buffer's data is
// given back to the host at the end of scope.
// make sure data is given back to the host at the end of this scope
} catch (cl::sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
return e.get_cl_code();
return 1;
}

float kernel_time = kernel_times / num_iters;
std::cerr << "GPU kernel time = " << kernel_time << " msec\n";
// End timer.
double end = timer.Elapsed();

esimd_test::display_timing_stats(kernel_times, num_iters,
(end - start) * 1000);

writeHist(bins);
writeHist(cpuHistogram);
Expand Down
15 changes: 12 additions & 3 deletions SYCL/ESIMD/histogram_256_slm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,10 @@ int main() {
auto LocalRange = cl::sycl::range<1>(NUM_BINS / 16);
cl::sycl::nd_range<1> Range(GlobalRange, LocalRange);

// Start Timer
esimd_test::Timer timer;
double start;

// Launches the task on the GPU.
double kernel_times = 0;
unsigned num_iters = 10;
Expand All @@ -174,14 +178,19 @@ int main() {
etime = esimd_test::report_time("kernel time", e, e);
if (iter > 0)
kernel_times += etime;
else
start = timer.Elapsed();
}
} catch (cl::sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
return e.get_cl_code();
return 1;
}

float kernel_time = kernel_times / num_iters;
std::cerr << "GPU kernel time = " << kernel_time << " msec\n";
// End timer.
double end = timer.Elapsed();

esimd_test::display_timing_stats(kernel_times, num_iters,
(end - start) * 1000);

std::cout << "finish GPU histogram\n";

Expand Down
50 changes: 37 additions & 13 deletions SYCL/ESIMD/histogram_256_slm_spec_2020.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,8 @@ class NumBlocksConst;
class histogram_slm;

int main(int argc, char **argv) {
queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler());
queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler(),
property::queue::enable_profiling{});
auto dev = q.get_device();
auto ctxt = q.get_context();

Expand Down Expand Up @@ -160,23 +161,46 @@ int main(int argc, char **argv) {
auto LocalRange = cl::sycl::range<1>(NUM_BINS / 16);
cl::sycl::nd_range<1> Range(GlobalRange, LocalRange);

// Start Timer
esimd_test::Timer timer;
double start;

double kernel_times = 0;
unsigned num_iters = 10;
try {
auto e = q.submit([&](cl::sycl::handler &cgh) {
cgh.set_specialization_constant<NumBlocksSpecId>(num_blocks);
cgh.parallel_for<histogram_slm>(
Range,
[=](cl::sycl::nd_item<1> ndi, kernel_handler kh) SYCL_ESIMD_KERNEL {
histogram_atomic(input_ptr, output_surface, ndi.get_group(0),
ndi.get_local_id(0), 16,
kh.get_specialization_constant<NumBlocksSpecId>());
});
});
e.wait();
// num_iters + 1, iteration#0 is for warmup
for (int iter = 0; iter <= num_iters; ++iter) {
double etime = 0;
memset(output_surface, 0, 4 * NUM_BINS);
auto e = q.submit([&](cl::sycl::handler &cgh) {
cgh.set_specialization_constant<NumBlocksSpecId>(num_blocks);
cgh.parallel_for<histogram_slm>(
Range,
[=](cl::sycl::nd_item<1> ndi, kernel_handler kh) SYCL_ESIMD_KERNEL {
histogram_atomic(
input_ptr, output_surface, ndi.get_group(0),
ndi.get_local_id(0), 16,
kh.get_specialization_constant<NumBlocksSpecId>());
});
});
e.wait();
etime = esimd_test::report_time("kernel time", e, e);
if (iter > 0)
kernel_times += etime;
else
start = timer.Elapsed();
}
} catch (cl::sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
return e.get_cl_code();
return 1;
}

// End timer.
double end = timer.Elapsed();

esimd_test::display_timing_stats(kernel_times, num_iters,
(end - start) * 1000);

std::cout << "finish GPU histogram\n";

memcpy(hist, output_surface, 4 * NUM_BINS);
Expand Down
144 changes: 85 additions & 59 deletions SYCL/ESIMD/histogram_2d.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,8 @@ int main(int argc, char *argv[]) {
// Read in image luma plane

// Allocate Input Buffer
queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler());
queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler(),
property::queue::enable_profiling{});

auto dev = q.get_device();
auto ctxt = q.get_context();
Expand Down Expand Up @@ -135,84 +136,109 @@ int main(int argc, char *argv[]) {
image_channel_type::unsigned_int32,
range<2>{width / sizeof(uint4), height});

try {
// create ranges
// We need that many workitems
auto GlobalRange = range<2>(range_width, range_height);
// Number of workitems in a workgroup
auto LocalRange = range<2>(1, 1);
nd_range<2> Range(GlobalRange, LocalRange);

auto e = q.submit([&](handler &cgh) {
auto readAcc = Img.get_access<uint4, cl::sycl::access::mode::read>(cgh);

cgh.parallel_for<class Hist>(
Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL {
using namespace sycl::ext::intel::experimental::esimd;

// Get thread origin offsets
uint h_pos = ndi.get_group(0) * BLOCK_WIDTH;
uint v_pos = ndi.get_group(1) * BLOCK_HEIGHT;
// Start Timer
esimd_test::Timer timer;
double start;

// Declare a 8x32 uchar matrix to store the input block pixel value
simd<unsigned char, 8 * 32> in;

// Declare a vector to store the local histogram
simd<unsigned int, NUM_BINS> histogram(0);

// Each thread handles BLOCK_HEIGHTxBLOCK_WIDTH pixel block
for (int y = 0; y < BLOCK_HEIGHT / 8; y++) {
// Perform 2D media block read to load 8x32 pixel block
in =
media_block_load<unsigned char, 8, 32>(readAcc, h_pos, v_pos);

// Accumulate local histogram for each pixel value
double kernel_times = 0;
unsigned num_iters = 10;
try {
// num_iters + 1, iteration#0 is for warmup
for (int iter = 0; iter <= num_iters; ++iter) {
double etime = 0;
for (int b = 0; b < NUM_BINS; b++)
bins[b] = 0;
// create ranges
// We need that many workitems
auto GlobalRange = range<2>(range_width, range_height);
// Number of workitems in a workgroup
auto LocalRange = range<2>(1, 1);
nd_range<2> Range(GlobalRange, LocalRange);

auto e = q.submit([&](handler &cgh) {
auto readAcc = Img.get_access<uint4, cl::sycl::access::mode::read>(cgh);

cgh.parallel_for<class Hist>(
Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL {
using namespace sycl::ext::intel::experimental::esimd;

// Get thread origin offsets
uint h_pos = ndi.get_group(0) * BLOCK_WIDTH;
uint v_pos = ndi.get_group(1) * BLOCK_HEIGHT;

// Declare a 8x32 uchar matrix to store the input block pixel
// value
simd<unsigned char, 8 * 32> in;

// Declare a vector to store the local histogram
simd<unsigned int, NUM_BINS> histogram(0);

// Each thread handles BLOCK_HEIGHTxBLOCK_WIDTH pixel block
for (int y = 0; y < BLOCK_HEIGHT / 8; y++) {
// Perform 2D media block read to load 8x32 pixel block
in = media_block_load<unsigned char, 8, 32>(readAcc, h_pos,
v_pos);

// Accumulate local histogram for each pixel value
#pragma unroll
for (int i = 0; i < 8; i++) {
for (int i = 0; i < 8; i++) {
#pragma unroll
for (int j = 0; j < 32; j++) {
histogram.select<1, 1>(in[i * 32 + j]) += 1;
for (int j = 0; j < 32; j++) {
histogram.select<1, 1>(in[i * 32 + j]) += 1;
}
}
}

// Update starting offset for the next work block
v_pos += 8;
}
// Update starting offset for the next work block
v_pos += 8;
}

// Declare a vector to store the offset for atomic write operation
simd<unsigned int, 8> offset(0, 1); // init to 0, 1, 2, ..., 7
offset *= sizeof(unsigned int);
// Declare a vector to store the offset for atomic write operation
simd<unsigned int, 8> offset(0, 1); // init to 0, 1, 2, ..., 7
offset *= sizeof(unsigned int);

// Update global sum by atomically adding each local histogram
// Update global sum by atomically adding each local histogram
#pragma unroll
for (int i = 0; i < NUM_BINS; i += 8) {
// Declare a vector to store the source for atomic write operation
simd<unsigned int, 8> src;
src = histogram.select<8, 1>(i);
for (int i = 0; i < NUM_BINS; i += 8) {
// Declare a vector to store the source for atomic write
// operation
simd<unsigned int, 8> src;
src = histogram.select<8, 1>(i);

#ifdef __SYCL_DEVICE_ONLY__
flat_atomic<atomic_op::add, unsigned int, 8>(bins, offset, src,
1);
offset += 8 * sizeof(unsigned int);
flat_atomic<atomic_op::add, unsigned int, 8>(bins, offset, src,
1);
offset += 8 * sizeof(unsigned int);
#else
simd<unsigned int, 8> vals;
vals.copy_from(bins + i);
vals = vals + src;
vals.copy_to(bins + i);
simd<unsigned int, 8> vals;
vals.copy_from(bins + i);
vals = vals + src;
vals.copy_to(bins + i);
#endif
}
});
});
e.wait();
}
});
});
e.wait();
etime = esimd_test::report_time("kernel time", e, e);
if (iter > 0)
kernel_times += etime;
else
start = timer.Elapsed();
}

// SYCL will enqueue and run the kernel. Recall that the buffer's data is
// given back to the host at the end of scope.
// make sure data is given back to the host at the end of this scope
} catch (cl::sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
return e.get_cl_code();
return 1;
}

// End timer.
double end = timer.Elapsed();

esimd_test::display_timing_stats(kernel_times, num_iters,
(end - start) * 1000);

writeHist(bins);
writeHist(cpuHistogram);
// Checking Histogram
Expand Down
Loading