Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

[SYCL][ESIMD] Print performance data for histogram tests in uniform way #524

Merged
merged 6 commits into from
Oct 21, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions SYCL/ESIMD/esimd_test_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,4 +190,12 @@ inline double report_time(const std::string &msg, event e0, event en) {
return elapsed;
}

void display_timing_stats(double const kernelTime,
unsigned int const uiNumberOfIterations,
double const overallTime) {
std::cout << "Number of iterations: " << uiNumberOfIterations << "\n";
std::cout << "[KernelTime]:" << kernelTime << "\n";
std::cout << "[OverallTime][Primary]:" << overallTime << "\n";
}

} // namespace esimd_test
16 changes: 14 additions & 2 deletions SYCL/ESIMD/histogram.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,11 +132,16 @@ int main(int argc, char *argv[]) {
image_channel_type::unsigned_int32,
range<2>{width / sizeof(uint4), height});

// Start Timer
esimd_test::Timer timer;
double start;

// Launches the task on the GPU.
double kernel_times = 0;
unsigned num_iters = 10;

try {
// num_iters + 1, iteration#0 is for warmup
for (int iter = 0; iter <= num_iters; ++iter) {
double etime = 0;
for (int b = 0; b < NUM_BINS; b++)
Expand Down Expand Up @@ -215,6 +220,8 @@ int main(int argc, char *argv[]) {
etime = esimd_test::report_time("kernel time", e, e);
if (iter > 0)
kernel_times += etime;
else
start = timer.Elapsed();
}
// SYCL will enqueue and run the kernel. Recall that the buffer's data is
// given back to the host at the end of scope.
Expand All @@ -224,8 +231,13 @@ int main(int argc, char *argv[]) {
return e.get_cl_code();
}

float kernel_time = kernel_times / num_iters;
std::cerr << "GPU kernel time = " << kernel_time << " msec\n";
// End timer.
double end = timer.Elapsed();

esimd_test::display_timing_stats(kernel_times, num_iters,
(end - start) * 1000);
// float kernel_time = kernel_times / num_iters;
// std::cerr << "GPU kernel time = " << kernel_time << " msec\n";

writeHist(bins);
writeHist(cpuHistogram);
Expand Down
15 changes: 13 additions & 2 deletions SYCL/ESIMD/histogram_256_slm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,10 @@ int main() {
auto LocalRange = cl::sycl::range<1>(NUM_BINS / 16);
cl::sycl::nd_range<1> Range(GlobalRange, LocalRange);

// Start Timer
esimd_test::Timer timer;
double start;

// Launches the task on the GPU.
double kernel_times = 0;
unsigned num_iters = 10;
Expand All @@ -174,14 +178,21 @@ int main() {
etime = esimd_test::report_time("kernel time", e, e);
if (iter > 0)
kernel_times += etime;
else
start = timer.Elapsed();
}
} catch (cl::sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
return e.get_cl_code();
}

float kernel_time = kernel_times / num_iters;
std::cerr << "GPU kernel time = " << kernel_time << " msec\n";
// End timer.
double end = timer.Elapsed();

esimd_test::display_timing_stats(kernel_times, num_iters,
(end - start) * 1000);
// float kernel_time = kernel_times / num_iters;
// std::cerr << "GPU kernel time = " << kernel_time << " msec\n";

std::cout << "finish GPU histogram\n";

Expand Down
45 changes: 34 additions & 11 deletions SYCL/ESIMD/histogram_256_slm_spec_2020.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -160,23 +160,46 @@ int main(int argc, char **argv) {
auto LocalRange = cl::sycl::range<1>(NUM_BINS / 16);
cl::sycl::nd_range<1> Range(GlobalRange, LocalRange);

// Start Timer
esimd_test::Timer timer;
double start;

double kernel_times = 0;
unsigned num_iters = 10;
try {
auto e = q.submit([&](cl::sycl::handler &cgh) {
cgh.set_specialization_constant<NumBlocksSpecId>(num_blocks);
cgh.parallel_for<histogram_slm>(
Range,
[=](cl::sycl::nd_item<1> ndi, kernel_handler kh) SYCL_ESIMD_KERNEL {
histogram_atomic(input_ptr, output_surface, ndi.get_group(0),
ndi.get_local_id(0), 16,
kh.get_specialization_constant<NumBlocksSpecId>());
});
});
e.wait();
// num_iters + 1, iteration#0 is for warmup
for (int iter = 0; iter <= num_iters; ++iter) {
double etime = 0;
memset(output_surface, 0, 4 * NUM_BINS);
auto e = q.submit([&](cl::sycl::handler &cgh) {
cgh.set_specialization_constant<NumBlocksSpecId>(num_blocks);
cgh.parallel_for<histogram_slm>(
Range,
[=](cl::sycl::nd_item<1> ndi, kernel_handler kh) SYCL_ESIMD_KERNEL {
histogram_atomic(
input_ptr, output_surface, ndi.get_group(0),
ndi.get_local_id(0), 16,
kh.get_specialization_constant<NumBlocksSpecId>());
});
});
e.wait();
etime = esimd_test::report_time("kernel time", e, e);
if (iter > 0)
kernel_times += etime;
else
start = timer.Elapsed();
}
} catch (cl::sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
return e.get_cl_code();
}

// End timer.
double end = timer.Elapsed();

esimd_test::display_timing_stats(kernel_times, num_iters,
(end - start) * 1000);

std::cout << "finish GPU histogram\n";

memcpy(hist, output_surface, 4 * NUM_BINS);
Expand Down
139 changes: 82 additions & 57 deletions SYCL/ESIMD/histogram_2d.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -135,75 +135,94 @@ int main(int argc, char *argv[]) {
image_channel_type::unsigned_int32,
range<2>{width / sizeof(uint4), height});

try {
// create ranges
// We need that many workitems
auto GlobalRange = range<2>(range_width, range_height);
// Number of workitems in a workgroup
auto LocalRange = range<2>(1, 1);
nd_range<2> Range(GlobalRange, LocalRange);

auto e = q.submit([&](handler &cgh) {
auto readAcc = Img.get_access<uint4, cl::sycl::access::mode::read>(cgh);

cgh.parallel_for<class Hist>(
Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL {
using namespace sycl::ext::intel::experimental::esimd;

// Get thread origin offsets
uint h_pos = ndi.get_group(0) * BLOCK_WIDTH;
uint v_pos = ndi.get_group(1) * BLOCK_HEIGHT;

// Declare a 8x32 uchar matrix to store the input block pixel value
simd<unsigned char, 8 * 32> in;

// Declare a vector to store the local histogram
simd<unsigned int, NUM_BINS> histogram(0);

// Each thread handles BLOCK_HEIGHTxBLOCK_WIDTH pixel block
for (int y = 0; y < BLOCK_HEIGHT / 8; y++) {
// Perform 2D media block read to load 8x32 pixel block
in =
media_block_load<unsigned char, 8, 32>(readAcc, h_pos, v_pos);
// Start Timer
esimd_test::Timer timer;
double start;

// Accumulate local histogram for each pixel value
double kernel_times = 0;
unsigned num_iters = 10;
try {
// num_iters + 1, iteration#0 is for warmup
for (int iter = 0; iter <= num_iters; ++iter) {
double etime = 0;
for (int b = 0; b < NUM_BINS; b++)
bins[b] = 0;
// create ranges
// We need that many workitems
auto GlobalRange = range<2>(range_width, range_height);
// Number of workitems in a workgroup
auto LocalRange = range<2>(1, 1);
nd_range<2> Range(GlobalRange, LocalRange);

auto e = q.submit([&](handler &cgh) {
auto readAcc = Img.get_access<uint4, cl::sycl::access::mode::read>(cgh);

cgh.parallel_for<class Hist>(
Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL {
using namespace sycl::ext::intel::experimental::esimd;

// Get thread origin offsets
uint h_pos = ndi.get_group(0) * BLOCK_WIDTH;
uint v_pos = ndi.get_group(1) * BLOCK_HEIGHT;

// Declare a 8x32 uchar matrix to store the input block pixel
// value
simd<unsigned char, 8 * 32> in;

// Declare a vector to store the local histogram
simd<unsigned int, NUM_BINS> histogram(0);

// Each thread handles BLOCK_HEIGHTxBLOCK_WIDTH pixel block
for (int y = 0; y < BLOCK_HEIGHT / 8; y++) {
// Perform 2D media block read to load 8x32 pixel block
in = media_block_load<unsigned char, 8, 32>(readAcc, h_pos,
v_pos);

// Accumulate local histogram for each pixel value
#pragma unroll
for (int i = 0; i < 8; i++) {
for (int i = 0; i < 8; i++) {
#pragma unroll
for (int j = 0; j < 32; j++) {
histogram.select<1, 1>(in[i * 32 + j]) += 1;
for (int j = 0; j < 32; j++) {
histogram.select<1, 1>(in[i * 32 + j]) += 1;
}
}
}

// Update starting offset for the next work block
v_pos += 8;
}
// Update starting offset for the next work block
v_pos += 8;
}

// Declare a vector to store the offset for atomic write operation
simd<unsigned int, 8> offset(0, 1); // init to 0, 1, 2, ..., 7
offset *= sizeof(unsigned int);
// Declare a vector to store the offset for atomic write operation
simd<unsigned int, 8> offset(0, 1); // init to 0, 1, 2, ..., 7
offset *= sizeof(unsigned int);

// Update global sum by atomically adding each local histogram
// Update global sum by atomically adding each local histogram
#pragma unroll
for (int i = 0; i < NUM_BINS; i += 8) {
// Declare a vector to store the source for atomic write operation
simd<unsigned int, 8> src;
src = histogram.select<8, 1>(i);
for (int i = 0; i < NUM_BINS; i += 8) {
// Declare a vector to store the source for atomic write
// operation
simd<unsigned int, 8> src;
src = histogram.select<8, 1>(i);

#ifdef __SYCL_DEVICE_ONLY__
flat_atomic<atomic_op::add, unsigned int, 8>(bins, offset, src,
1);
offset += 8 * sizeof(unsigned int);
flat_atomic<atomic_op::add, unsigned int, 8>(bins, offset, src,
1);
offset += 8 * sizeof(unsigned int);
#else
simd<unsigned int, 8> vals;
vals.copy_from(bins + i);
vals = vals + src;
vals.copy_to(bins + i);
simd<unsigned int, 8> vals;
vals.copy_from(bins + i);
vals = vals + src;
vals.copy_to(bins + i);
#endif
}
});
});
e.wait();
}
});
});
e.wait();
etime = esimd_test::report_time("kernel time", e, e);
if (iter > 0)
kernel_times += etime;
else
start = timer.Elapsed();
}

// SYCL will enqueue and run the kernel. Recall that the buffer's data is
// given back to the host at the end of scope.
Expand All @@ -213,6 +232,12 @@ int main(int argc, char *argv[]) {
return e.get_cl_code();
}

// End timer.
double end = timer.Elapsed();

esimd_test::display_timing_stats(kernel_times, num_iters,
(end - start) * 1000);

writeHist(bins);
writeHist(cpuHistogram);
// Checking Histogram
Expand Down
Loading