Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit dd28d9c

Browse files
committed
[SYCL][ESIMD] Output perfomance data for histogram test in uniform way
Signed-off-by: Sergey Dmitriev <[email protected]>
1 parent 14e781e commit dd28d9c

File tree

6 files changed

+234
-130
lines changed

6 files changed

+234
-130
lines changed

SYCL/ESIMD/esimd_test_utils.hpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,4 +190,12 @@ inline double report_time(const std::string &msg, event e0, event en) {
190190
return elapsed;
191191
}
192192

193+
void display_timing_stats(double const kernelTime,
194+
unsigned int const uiNumberOfIterations,
195+
double const overallTime) {
196+
std::cout << "Number of iterations: " << uiNumberOfIterations << "\n";
197+
std::cout << "[KernelTime]:" << kernelTime << "\n";
198+
std::cout << "[OverallTime][Primary]:" << overallTime << "\n";
199+
}
200+
193201
} // namespace esimd_test

SYCL/ESIMD/histogram.cpp

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -132,11 +132,16 @@ int main(int argc, char *argv[]) {
132132
image_channel_type::unsigned_int32,
133133
range<2>{width / sizeof(uint4), height});
134134

135+
// Start Timer
136+
esimd_test::Timer timer;
137+
double start;
138+
135139
// Launches the task on the GPU.
136140
double kernel_times = 0;
137141
unsigned num_iters = 10;
138142

139143
try {
144+
// num_iters + 1, iteration#0 is for warmup
140145
for (int iter = 0; iter <= num_iters; ++iter) {
141146
double etime = 0;
142147
for (int b = 0; b < NUM_BINS; b++)
@@ -215,6 +220,8 @@ int main(int argc, char *argv[]) {
215220
etime = esimd_test::report_time("kernel time", e, e);
216221
if (iter > 0)
217222
kernel_times += etime;
223+
else
224+
start = timer.Elapsed();
218225
}
219226
// SYCL will enqueue and run the kernel. Recall that the buffer's data is
220227
// given back to the host at the end of scope.
@@ -224,8 +231,13 @@ int main(int argc, char *argv[]) {
224231
return e.get_cl_code();
225232
}
226233

227-
float kernel_time = kernel_times / num_iters;
228-
std::cerr << "GPU kernel time = " << kernel_time << " msec\n";
234+
// End timer.
235+
double end = timer.Elapsed();
236+
237+
esimd_test::display_timing_stats(kernel_times, num_iters,
238+
(end - start) * 1000);
239+
// float kernel_time = kernel_times / num_iters;
240+
// std::cerr << "GPU kernel time = " << kernel_time << " msec\n";
229241

230242
writeHist(bins);
231243
writeHist(cpuHistogram);

SYCL/ESIMD/histogram_256_slm.cpp

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,10 @@ int main() {
156156
auto LocalRange = cl::sycl::range<1>(NUM_BINS / 16);
157157
cl::sycl::nd_range<1> Range(GlobalRange, LocalRange);
158158

159+
// Start Timer
160+
esimd_test::Timer timer;
161+
double start;
162+
159163
// Launches the task on the GPU.
160164
double kernel_times = 0;
161165
unsigned num_iters = 10;
@@ -174,14 +178,21 @@ int main() {
174178
etime = esimd_test::report_time("kernel time", e, e);
175179
if (iter > 0)
176180
kernel_times += etime;
181+
else
182+
start = timer.Elapsed();
177183
}
178184
} catch (cl::sycl::exception const &e) {
179185
std::cout << "SYCL exception caught: " << e.what() << '\n';
180186
return e.get_cl_code();
181187
}
182188

183-
float kernel_time = kernel_times / num_iters;
184-
std::cerr << "GPU kernel time = " << kernel_time << " msec\n";
189+
// End timer.
190+
double end = timer.Elapsed();
191+
192+
esimd_test::display_timing_stats(kernel_times, num_iters,
193+
(end - start) * 1000);
194+
// float kernel_time = kernel_times / num_iters;
195+
// std::cerr << "GPU kernel time = " << kernel_time << " msec\n";
185196

186197
std::cout << "finish GPU histogram\n";
187198

SYCL/ESIMD/histogram_256_slm_spec_2020.cpp

Lines changed: 34 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -160,23 +160,46 @@ int main(int argc, char **argv) {
160160
auto LocalRange = cl::sycl::range<1>(NUM_BINS / 16);
161161
cl::sycl::nd_range<1> Range(GlobalRange, LocalRange);
162162

163+
// Start Timer
164+
esimd_test::Timer timer;
165+
double start;
166+
167+
double kernel_times = 0;
168+
unsigned num_iters = 10;
163169
try {
164-
auto e = q.submit([&](cl::sycl::handler &cgh) {
165-
cgh.set_specialization_constant<NumBlocksSpecId>(num_blocks);
166-
cgh.parallel_for<histogram_slm>(
167-
Range,
168-
[=](cl::sycl::nd_item<1> ndi, kernel_handler kh) SYCL_ESIMD_KERNEL {
169-
histogram_atomic(input_ptr, output_surface, ndi.get_group(0),
170-
ndi.get_local_id(0), 16,
171-
kh.get_specialization_constant<NumBlocksSpecId>());
172-
});
173-
});
174-
e.wait();
170+
// num_iters + 1, iteration#0 is for warmup
171+
for (int iter = 0; iter <= num_iters; ++iter) {
172+
double etime = 0;
173+
memset(output_surface, 0, 4 * NUM_BINS);
174+
auto e = q.submit([&](cl::sycl::handler &cgh) {
175+
cgh.set_specialization_constant<NumBlocksSpecId>(num_blocks);
176+
cgh.parallel_for<histogram_slm>(
177+
Range,
178+
[=](cl::sycl::nd_item<1> ndi, kernel_handler kh) SYCL_ESIMD_KERNEL {
179+
histogram_atomic(
180+
input_ptr, output_surface, ndi.get_group(0),
181+
ndi.get_local_id(0), 16,
182+
kh.get_specialization_constant<NumBlocksSpecId>());
183+
});
184+
});
185+
e.wait();
186+
etime = esimd_test::report_time("kernel time", e, e);
187+
if (iter > 0)
188+
kernel_times += etime;
189+
else
190+
start = timer.Elapsed();
191+
}
175192
} catch (cl::sycl::exception const &e) {
176193
std::cout << "SYCL exception caught: " << e.what() << '\n';
177194
return e.get_cl_code();
178195
}
179196

197+
// End timer.
198+
double end = timer.Elapsed();
199+
200+
esimd_test::display_timing_stats(kernel_times, num_iters,
201+
(end - start) * 1000);
202+
180203
std::cout << "finish GPU histogram\n";
181204

182205
memcpy(hist, output_surface, 4 * NUM_BINS);

SYCL/ESIMD/histogram_2d.cpp

Lines changed: 82 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -135,75 +135,94 @@ int main(int argc, char *argv[]) {
135135
image_channel_type::unsigned_int32,
136136
range<2>{width / sizeof(uint4), height});
137137

138-
try {
139-
// create ranges
140-
// We need that many workitems
141-
auto GlobalRange = range<2>(range_width, range_height);
142-
// Number of workitems in a workgroup
143-
auto LocalRange = range<2>(1, 1);
144-
nd_range<2> Range(GlobalRange, LocalRange);
145-
146-
auto e = q.submit([&](handler &cgh) {
147-
auto readAcc = Img.get_access<uint4, cl::sycl::access::mode::read>(cgh);
148-
149-
cgh.parallel_for<class Hist>(
150-
Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL {
151-
using namespace sycl::ext::intel::experimental::esimd;
152-
153-
// Get thread origin offsets
154-
uint h_pos = ndi.get_group(0) * BLOCK_WIDTH;
155-
uint v_pos = ndi.get_group(1) * BLOCK_HEIGHT;
156-
157-
// Declare a 8x32 uchar matrix to store the input block pixel value
158-
simd<unsigned char, 8 * 32> in;
159-
160-
// Declare a vector to store the local histogram
161-
simd<unsigned int, NUM_BINS> histogram(0);
162-
163-
// Each thread handles BLOCK_HEIGHTxBLOCK_WIDTH pixel block
164-
for (int y = 0; y < BLOCK_HEIGHT / 8; y++) {
165-
// Perform 2D media block read to load 8x32 pixel block
166-
in =
167-
media_block_load<unsigned char, 8, 32>(readAcc, h_pos, v_pos);
138+
// Start Timer
139+
esimd_test::Timer timer;
140+
double start;
168141

169-
// Accumulate local histogram for each pixel value
142+
double kernel_times = 0;
143+
unsigned num_iters = 10;
144+
try {
145+
// num_iters + 1, iteration#0 is for warmup
146+
for (int iter = 0; iter <= num_iters; ++iter) {
147+
double etime = 0;
148+
for (int b = 0; b < NUM_BINS; b++)
149+
bins[b] = 0;
150+
// create ranges
151+
// We need that many workitems
152+
auto GlobalRange = range<2>(range_width, range_height);
153+
// Number of workitems in a workgroup
154+
auto LocalRange = range<2>(1, 1);
155+
nd_range<2> Range(GlobalRange, LocalRange);
156+
157+
auto e = q.submit([&](handler &cgh) {
158+
auto readAcc = Img.get_access<uint4, cl::sycl::access::mode::read>(cgh);
159+
160+
cgh.parallel_for<class Hist>(
161+
Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL {
162+
using namespace sycl::ext::intel::experimental::esimd;
163+
164+
// Get thread origin offsets
165+
uint h_pos = ndi.get_group(0) * BLOCK_WIDTH;
166+
uint v_pos = ndi.get_group(1) * BLOCK_HEIGHT;
167+
168+
// Declare a 8x32 uchar matrix to store the input block pixel
169+
// value
170+
simd<unsigned char, 8 * 32> in;
171+
172+
// Declare a vector to store the local histogram
173+
simd<unsigned int, NUM_BINS> histogram(0);
174+
175+
// Each thread handles BLOCK_HEIGHTxBLOCK_WIDTH pixel block
176+
for (int y = 0; y < BLOCK_HEIGHT / 8; y++) {
177+
// Perform 2D media block read to load 8x32 pixel block
178+
in = media_block_load<unsigned char, 8, 32>(readAcc, h_pos,
179+
v_pos);
180+
181+
// Accumulate local histogram for each pixel value
170182
#pragma unroll
171-
for (int i = 0; i < 8; i++) {
183+
for (int i = 0; i < 8; i++) {
172184
#pragma unroll
173-
for (int j = 0; j < 32; j++) {
174-
histogram.select<1, 1>(in[i * 32 + j]) += 1;
185+
for (int j = 0; j < 32; j++) {
186+
histogram.select<1, 1>(in[i * 32 + j]) += 1;
187+
}
175188
}
176-
}
177189

178-
// Update starting offset for the next work block
179-
v_pos += 8;
180-
}
190+
// Update starting offset for the next work block
191+
v_pos += 8;
192+
}
181193

182-
// Declare a vector to store the offset for atomic write operation
183-
simd<unsigned int, 8> offset(0, 1); // init to 0, 1, 2, ..., 7
184-
offset *= sizeof(unsigned int);
194+
// Declare a vector to store the offset for atomic write operation
195+
simd<unsigned int, 8> offset(0, 1); // init to 0, 1, 2, ..., 7
196+
offset *= sizeof(unsigned int);
185197

186-
// Update global sum by atomically adding each local histogram
198+
// Update global sum by atomically adding each local histogram
187199
#pragma unroll
188-
for (int i = 0; i < NUM_BINS; i += 8) {
189-
// Declare a vector to store the source for atomic write operation
190-
simd<unsigned int, 8> src;
191-
src = histogram.select<8, 1>(i);
200+
for (int i = 0; i < NUM_BINS; i += 8) {
201+
// Declare a vector to store the source for atomic write
202+
// operation
203+
simd<unsigned int, 8> src;
204+
src = histogram.select<8, 1>(i);
192205

193206
#ifdef __SYCL_DEVICE_ONLY__
194-
flat_atomic<atomic_op::add, unsigned int, 8>(bins, offset, src,
195-
1);
196-
offset += 8 * sizeof(unsigned int);
207+
flat_atomic<atomic_op::add, unsigned int, 8>(bins, offset, src,
208+
1);
209+
offset += 8 * sizeof(unsigned int);
197210
#else
198-
simd<unsigned int, 8> vals;
199-
vals.copy_from(bins + i);
200-
vals = vals + src;
201-
vals.copy_to(bins + i);
211+
simd<unsigned int, 8> vals;
212+
vals.copy_from(bins + i);
213+
vals = vals + src;
214+
vals.copy_to(bins + i);
202215
#endif
203-
}
204-
});
205-
});
206-
e.wait();
216+
}
217+
});
218+
});
219+
e.wait();
220+
etime = esimd_test::report_time("kernel time", e, e);
221+
if (iter > 0)
222+
kernel_times += etime;
223+
else
224+
start = timer.Elapsed();
225+
}
207226

208227
// SYCL will enqueue and run the kernel. Recall that the buffer's data is
209228
// given back to the host at the end of scope.
@@ -213,6 +232,12 @@ int main(int argc, char *argv[]) {
213232
return e.get_cl_code();
214233
}
215234

235+
// End timer.
236+
double end = timer.Elapsed();
237+
238+
esimd_test::display_timing_stats(kernel_times, num_iters,
239+
(end - start) * 1000);
240+
216241
writeHist(bins);
217242
writeHist(cpuHistogram);
218243
// Checking Histogram

0 commit comments

Comments
 (0)