Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit fff3672

Browse files
[SYCL][ESIMD] Print performance data for histogram tests in uniform way (#524)
* [SYCL][ESIMD] Output perfomance data for histogram test in uniform way Signed-off-by: Sergey Dmitriev <[email protected]> Co-authored-by: Vyacheslav Klochkov <[email protected]>
1 parent f531ccf commit fff3672

File tree

6 files changed

+241
-138
lines changed

6 files changed

+241
-138
lines changed

SYCL/ESIMD/esimd_test_utils.hpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,4 +190,12 @@ inline double report_time(const std::string &msg, event e0, event en) {
190190
return elapsed;
191191
}
192192

193+
void display_timing_stats(double const kernelTime,
194+
unsigned int const uiNumberOfIterations,
195+
double const overallTime) {
196+
std::cout << "Number of iterations: " << uiNumberOfIterations << "\n";
197+
std::cout << "[KernelTime]:" << kernelTime << "\n";
198+
std::cout << "[OverallTime][Primary]:" << overallTime << "\n";
199+
}
200+
193201
} // namespace esimd_test

SYCL/ESIMD/histogram.cpp

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -132,11 +132,16 @@ int main(int argc, char *argv[]) {
132132
image_channel_type::unsigned_int32,
133133
range<2>{width / sizeof(uint4), height});
134134

135+
// Start Timer
136+
esimd_test::Timer timer;
137+
double start;
138+
135139
// Launches the task on the GPU.
136140
double kernel_times = 0;
137141
unsigned num_iters = 10;
138142

139143
try {
144+
// num_iters + 1, iteration#0 is for warmup
140145
for (int iter = 0; iter <= num_iters; ++iter) {
141146
double etime = 0;
142147
for (int b = 0; b < NUM_BINS; b++)
@@ -215,17 +220,22 @@ int main(int argc, char *argv[]) {
215220
etime = esimd_test::report_time("kernel time", e, e);
216221
if (iter > 0)
217222
kernel_times += etime;
223+
else
224+
start = timer.Elapsed();
218225
}
219226
// SYCL will enqueue and run the kernel. Recall that the buffer's data is
220227
// given back to the host at the end of scope.
221228
// make sure data is given back to the host at the end of this scope
222229
} catch (cl::sycl::exception const &e) {
223230
std::cout << "SYCL exception caught: " << e.what() << '\n';
224-
return e.get_cl_code();
231+
return 1;
225232
}
226233

227-
float kernel_time = kernel_times / num_iters;
228-
std::cerr << "GPU kernel time = " << kernel_time << " msec\n";
234+
// End timer.
235+
double end = timer.Elapsed();
236+
237+
esimd_test::display_timing_stats(kernel_times, num_iters,
238+
(end - start) * 1000);
229239

230240
writeHist(bins);
231241
writeHist(cpuHistogram);

SYCL/ESIMD/histogram_256_slm.cpp

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,10 @@ int main() {
156156
auto LocalRange = cl::sycl::range<1>(NUM_BINS / 16);
157157
cl::sycl::nd_range<1> Range(GlobalRange, LocalRange);
158158

159+
// Start Timer
160+
esimd_test::Timer timer;
161+
double start;
162+
159163
// Launches the task on the GPU.
160164
double kernel_times = 0;
161165
unsigned num_iters = 10;
@@ -174,14 +178,19 @@ int main() {
174178
etime = esimd_test::report_time("kernel time", e, e);
175179
if (iter > 0)
176180
kernel_times += etime;
181+
else
182+
start = timer.Elapsed();
177183
}
178184
} catch (cl::sycl::exception const &e) {
179185
std::cout << "SYCL exception caught: " << e.what() << '\n';
180-
return e.get_cl_code();
186+
return 1;
181187
}
182188

183-
float kernel_time = kernel_times / num_iters;
184-
std::cerr << "GPU kernel time = " << kernel_time << " msec\n";
189+
// End timer.
190+
double end = timer.Elapsed();
191+
192+
esimd_test::display_timing_stats(kernel_times, num_iters,
193+
(end - start) * 1000);
185194

186195
std::cout << "finish GPU histogram\n";
187196

SYCL/ESIMD/histogram_256_slm_spec_2020.cpp

Lines changed: 37 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,8 @@ class NumBlocksConst;
101101
class histogram_slm;
102102

103103
int main(int argc, char **argv) {
104-
queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler());
104+
queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler(),
105+
property::queue::enable_profiling{});
105106
auto dev = q.get_device();
106107
auto ctxt = q.get_context();
107108

@@ -160,23 +161,46 @@ int main(int argc, char **argv) {
160161
auto LocalRange = cl::sycl::range<1>(NUM_BINS / 16);
161162
cl::sycl::nd_range<1> Range(GlobalRange, LocalRange);
162163

164+
// Start Timer
165+
esimd_test::Timer timer;
166+
double start;
167+
168+
double kernel_times = 0;
169+
unsigned num_iters = 10;
163170
try {
164-
auto e = q.submit([&](cl::sycl::handler &cgh) {
165-
cgh.set_specialization_constant<NumBlocksSpecId>(num_blocks);
166-
cgh.parallel_for<histogram_slm>(
167-
Range,
168-
[=](cl::sycl::nd_item<1> ndi, kernel_handler kh) SYCL_ESIMD_KERNEL {
169-
histogram_atomic(input_ptr, output_surface, ndi.get_group(0),
170-
ndi.get_local_id(0), 16,
171-
kh.get_specialization_constant<NumBlocksSpecId>());
172-
});
173-
});
174-
e.wait();
171+
// num_iters + 1, iteration#0 is for warmup
172+
for (int iter = 0; iter <= num_iters; ++iter) {
173+
double etime = 0;
174+
memset(output_surface, 0, 4 * NUM_BINS);
175+
auto e = q.submit([&](cl::sycl::handler &cgh) {
176+
cgh.set_specialization_constant<NumBlocksSpecId>(num_blocks);
177+
cgh.parallel_for<histogram_slm>(
178+
Range,
179+
[=](cl::sycl::nd_item<1> ndi, kernel_handler kh) SYCL_ESIMD_KERNEL {
180+
histogram_atomic(
181+
input_ptr, output_surface, ndi.get_group(0),
182+
ndi.get_local_id(0), 16,
183+
kh.get_specialization_constant<NumBlocksSpecId>());
184+
});
185+
});
186+
e.wait();
187+
etime = esimd_test::report_time("kernel time", e, e);
188+
if (iter > 0)
189+
kernel_times += etime;
190+
else
191+
start = timer.Elapsed();
192+
}
175193
} catch (cl::sycl::exception const &e) {
176194
std::cout << "SYCL exception caught: " << e.what() << '\n';
177-
return e.get_cl_code();
195+
return 1;
178196
}
179197

198+
// End timer.
199+
double end = timer.Elapsed();
200+
201+
esimd_test::display_timing_stats(kernel_times, num_iters,
202+
(end - start) * 1000);
203+
180204
std::cout << "finish GPU histogram\n";
181205

182206
memcpy(hist, output_surface, 4 * NUM_BINS);

SYCL/ESIMD/histogram_2d.cpp

Lines changed: 85 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,8 @@ int main(int argc, char *argv[]) {
7979
// Read in image luma plane
8080

8181
// Allocate Input Buffer
82-
queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler());
82+
queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler(),
83+
property::queue::enable_profiling{});
8384

8485
auto dev = q.get_device();
8586
auto ctxt = q.get_context();
@@ -135,84 +136,109 @@ int main(int argc, char *argv[]) {
135136
image_channel_type::unsigned_int32,
136137
range<2>{width / sizeof(uint4), height});
137138

138-
try {
139-
// create ranges
140-
// We need that many workitems
141-
auto GlobalRange = range<2>(range_width, range_height);
142-
// Number of workitems in a workgroup
143-
auto LocalRange = range<2>(1, 1);
144-
nd_range<2> Range(GlobalRange, LocalRange);
145-
146-
auto e = q.submit([&](handler &cgh) {
147-
auto readAcc = Img.get_access<uint4, cl::sycl::access::mode::read>(cgh);
148-
149-
cgh.parallel_for<class Hist>(
150-
Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL {
151-
using namespace sycl::ext::intel::experimental::esimd;
152-
153-
// Get thread origin offsets
154-
uint h_pos = ndi.get_group(0) * BLOCK_WIDTH;
155-
uint v_pos = ndi.get_group(1) * BLOCK_HEIGHT;
139+
// Start Timer
140+
esimd_test::Timer timer;
141+
double start;
156142

157-
// Declare a 8x32 uchar matrix to store the input block pixel value
158-
simd<unsigned char, 8 * 32> in;
159-
160-
// Declare a vector to store the local histogram
161-
simd<unsigned int, NUM_BINS> histogram(0);
162-
163-
// Each thread handles BLOCK_HEIGHTxBLOCK_WIDTH pixel block
164-
for (int y = 0; y < BLOCK_HEIGHT / 8; y++) {
165-
// Perform 2D media block read to load 8x32 pixel block
166-
in =
167-
media_block_load<unsigned char, 8, 32>(readAcc, h_pos, v_pos);
168-
169-
// Accumulate local histogram for each pixel value
143+
double kernel_times = 0;
144+
unsigned num_iters = 10;
145+
try {
146+
// num_iters + 1, iteration#0 is for warmup
147+
for (int iter = 0; iter <= num_iters; ++iter) {
148+
double etime = 0;
149+
for (int b = 0; b < NUM_BINS; b++)
150+
bins[b] = 0;
151+
// create ranges
152+
// We need that many workitems
153+
auto GlobalRange = range<2>(range_width, range_height);
154+
// Number of workitems in a workgroup
155+
auto LocalRange = range<2>(1, 1);
156+
nd_range<2> Range(GlobalRange, LocalRange);
157+
158+
auto e = q.submit([&](handler &cgh) {
159+
auto readAcc = Img.get_access<uint4, cl::sycl::access::mode::read>(cgh);
160+
161+
cgh.parallel_for<class Hist>(
162+
Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL {
163+
using namespace sycl::ext::intel::experimental::esimd;
164+
165+
// Get thread origin offsets
166+
uint h_pos = ndi.get_group(0) * BLOCK_WIDTH;
167+
uint v_pos = ndi.get_group(1) * BLOCK_HEIGHT;
168+
169+
// Declare a 8x32 uchar matrix to store the input block pixel
170+
// value
171+
simd<unsigned char, 8 * 32> in;
172+
173+
// Declare a vector to store the local histogram
174+
simd<unsigned int, NUM_BINS> histogram(0);
175+
176+
// Each thread handles BLOCK_HEIGHTxBLOCK_WIDTH pixel block
177+
for (int y = 0; y < BLOCK_HEIGHT / 8; y++) {
178+
// Perform 2D media block read to load 8x32 pixel block
179+
in = media_block_load<unsigned char, 8, 32>(readAcc, h_pos,
180+
v_pos);
181+
182+
// Accumulate local histogram for each pixel value
170183
#pragma unroll
171-
for (int i = 0; i < 8; i++) {
184+
for (int i = 0; i < 8; i++) {
172185
#pragma unroll
173-
for (int j = 0; j < 32; j++) {
174-
histogram.select<1, 1>(in[i * 32 + j]) += 1;
186+
for (int j = 0; j < 32; j++) {
187+
histogram.select<1, 1>(in[i * 32 + j]) += 1;
188+
}
175189
}
176-
}
177190

178-
// Update starting offset for the next work block
179-
v_pos += 8;
180-
}
191+
// Update starting offset for the next work block
192+
v_pos += 8;
193+
}
181194

182-
// Declare a vector to store the offset for atomic write operation
183-
simd<unsigned int, 8> offset(0, 1); // init to 0, 1, 2, ..., 7
184-
offset *= sizeof(unsigned int);
195+
// Declare a vector to store the offset for atomic write operation
196+
simd<unsigned int, 8> offset(0, 1); // init to 0, 1, 2, ..., 7
197+
offset *= sizeof(unsigned int);
185198

186-
// Update global sum by atomically adding each local histogram
199+
// Update global sum by atomically adding each local histogram
187200
#pragma unroll
188-
for (int i = 0; i < NUM_BINS; i += 8) {
189-
// Declare a vector to store the source for atomic write operation
190-
simd<unsigned int, 8> src;
191-
src = histogram.select<8, 1>(i);
201+
for (int i = 0; i < NUM_BINS; i += 8) {
202+
// Declare a vector to store the source for atomic write
203+
// operation
204+
simd<unsigned int, 8> src;
205+
src = histogram.select<8, 1>(i);
192206

193207
#ifdef __SYCL_DEVICE_ONLY__
194-
flat_atomic<atomic_op::add, unsigned int, 8>(bins, offset, src,
195-
1);
196-
offset += 8 * sizeof(unsigned int);
208+
flat_atomic<atomic_op::add, unsigned int, 8>(bins, offset, src,
209+
1);
210+
offset += 8 * sizeof(unsigned int);
197211
#else
198-
simd<unsigned int, 8> vals;
199-
vals.copy_from(bins + i);
200-
vals = vals + src;
201-
vals.copy_to(bins + i);
212+
simd<unsigned int, 8> vals;
213+
vals.copy_from(bins + i);
214+
vals = vals + src;
215+
vals.copy_to(bins + i);
202216
#endif
203-
}
204-
});
205-
});
206-
e.wait();
217+
}
218+
});
219+
});
220+
e.wait();
221+
etime = esimd_test::report_time("kernel time", e, e);
222+
if (iter > 0)
223+
kernel_times += etime;
224+
else
225+
start = timer.Elapsed();
226+
}
207227

208228
// SYCL will enqueue and run the kernel. Recall that the buffer's data is
209229
// given back to the host at the end of scope.
210230
// make sure data is given back to the host at the end of this scope
211231
} catch (cl::sycl::exception const &e) {
212232
std::cout << "SYCL exception caught: " << e.what() << '\n';
213-
return e.get_cl_code();
233+
return 1;
214234
}
215235

236+
// End timer.
237+
double end = timer.Elapsed();
238+
239+
esimd_test::display_timing_stats(kernel_times, num_iters,
240+
(end - start) * 1000);
241+
216242
writeHist(bins);
217243
writeHist(cpuHistogram);
218244
// Checking Histogram

0 commit comments

Comments
 (0)