Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit b9ae747

Browse files
author
Gang Y Chen
committed
[SYCL][ESIMD] refactor timing code
Signed-off-by: Gang Y Chen <[email protected]>
1 parent 8b83ab3 commit b9ae747

9 files changed

+159
-277
lines changed

SYCL/ESIMD/BitonicSortK.cpp

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -506,16 +506,6 @@ ESIMD_INLINE void cmk_bitonic_merge(AccTy buf, uint32_t n, uint32_t m,
506506
}
507507
}
508508

509-
static double report_time(const string &msg, event e0, event en) {
510-
cl_ulong time_start =
511-
e0.get_profiling_info<info::event_profiling::command_start>();
512-
cl_ulong time_end =
513-
en.get_profiling_info<info::event_profiling::command_end>();
514-
double elapsed = (time_end - time_start) / 1e6;
515-
cout << msg << elapsed << " milliseconds" << std::endl;
516-
return elapsed;
517-
}
518-
519509
struct BitonicSort {
520510
enum {
521511
base_sort_size_ = 256,
@@ -615,7 +605,7 @@ int BitonicSort::Solve(uint32_t *pInputs, uint32_t *pOutputs, uint32_t size) {
615605
});
616606
});
617607
e.wait();
618-
total_time += report_time("kernel time", e, e);
608+
total_time += esimd_test::report_time("kernel time", e, e);
619609
} catch (cl::sycl::exception const &e) {
620610
std::cout << "SYCL exception caught: " << e.what() << '\n';
621611
return e.get_cl_code();
@@ -663,7 +653,8 @@ int BitonicSort::Solve(uint32_t *pInputs, uint32_t *pOutputs, uint32_t size) {
663653
}
664654

665655
mergeEvent[k - 1].wait();
666-
total_time += report_time("kernel time", mergeEvent[0], mergeEvent[k - 1]);
656+
total_time +=
657+
esimd_test::report_time("kernel time", mergeEvent[0], mergeEvent[k - 1]);
667658

668659
cout << " Sorting Time = " << total_time << " msec " << std::endl;
669660
return 1;

SYCL/ESIMD/BitonicSortKv2.cpp

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -421,16 +421,6 @@ ESIMD_INLINE void cmk_bitonic_merge(uint32_t *buf, uint32_t n, uint32_t m,
421421
}
422422
}
423423

424-
static double report_time(const string &msg, event e0, event en) {
425-
cl_ulong time_start =
426-
e0.get_profiling_info<info::event_profiling::command_start>();
427-
cl_ulong time_end =
428-
en.get_profiling_info<info::event_profiling::command_end>();
429-
double elapsed = (time_end - time_start) / 1e6;
430-
cout << msg << elapsed << " milliseconds" << std::endl;
431-
return elapsed;
432-
}
433-
434424
struct BitonicSort {
435425
enum {
436426
base_sort_size_ = 256,
@@ -532,7 +522,7 @@ int BitonicSort::Solve(uint32_t *pInputs, uint32_t *pOutputs, uint32_t size) {
532522
});
533523
});
534524
e.wait();
535-
total_time += report_time("kernel time", e, e);
525+
total_time += esimd_test::report_time("kernel time", e, e);
536526
} catch (cl::sycl::exception const &e) {
537527
std::cout << "SYCL exception caught: " << e.what() << '\n';
538528
return e.get_cl_code();
@@ -580,7 +570,8 @@ int BitonicSort::Solve(uint32_t *pInputs, uint32_t *pOutputs, uint32_t size) {
580570
}
581571

582572
mergeEvent[k - 1].wait();
583-
total_time += report_time("kernel time", mergeEvent[0], mergeEvent[k - 1]);
573+
total_time +=
574+
esimd_test::report_time("kernel time", mergeEvent[0], mergeEvent[k - 1]);
584575

585576
cout << " Sorting Time = " << total_time << " msec " << std::endl;
586577
return 1;

SYCL/ESIMD/esimd_test_utils.hpp

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#define NOMINMAX
1212

1313
#include <algorithm>
14+
#include <chrono>
1415
#include <cstring>
1516
#include <fstream>
1617
#include <iostream>
@@ -161,4 +162,32 @@ To bit_cast(const From &src) noexcept {
161162
return dst;
162163
}
163164

165+
// Timer class for measuring elasped time
166+
class Timer {
167+
public:
168+
Timer() : start_(std::chrono::steady_clock::now()) {}
169+
170+
double Elapsed() {
171+
auto now = std::chrono::steady_clock::now();
172+
return std::chrono::duration_cast<Duration>(now - start_).count();
173+
}
174+
175+
private:
176+
using Duration = std::chrono::duration<double>;
177+
std::chrono::steady_clock::time_point start_;
178+
};
179+
180+
// e0 is the first event, en is the last event
181+
// find the time difference between the starting time of the e0 and
182+
// the ending time of en, return micro-second
183+
inline double report_time(const std::string &msg, event e0, event en) {
184+
cl_ulong time_start =
185+
e0.get_profiling_info<info::event_profiling::command_start>();
186+
cl_ulong time_end =
187+
en.get_profiling_info<info::event_profiling::command_end>();
188+
double elapsed = (time_end - time_start) / 1e6;
189+
// cerr << msg << elapsed << " msecs" << std::endl;
190+
return elapsed;
191+
}
192+
164193
} // namespace esimd_test

SYCL/ESIMD/histogram.cpp

Lines changed: 76 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -41,11 +41,11 @@ void histogram_CPU(unsigned int width, unsigned int height, unsigned char *srcY,
4141
void writeHist(unsigned int *hist) {
4242
int total = 0;
4343

44-
std::cerr << "\nHistogram: \n";
44+
// std::cerr << "\nHistogram: \n";
4545
for (int i = 0; i < NUM_BINS; i += 8) {
46-
std::cerr << "\n [" << i << " - " << i + 7 << "]:";
46+
// std::cerr << "\n [" << i << " - " << i + 7 << "]:";
4747
for (int j = 0; j < 8; j++) {
48-
std::cerr << "\t" << hist[i + j];
48+
// std::cerr << "\t" << hist[i + j];
4949
total += hist[i + j];
5050
}
5151
}
@@ -80,7 +80,8 @@ int main(int argc, char *argv[]) {
8080
// Read in image luma plane
8181

8282
// Allocate Input Buffer
83-
queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler());
83+
queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler(),
84+
property::queue::enable_profiling{});
8485

8586
auto dev = q.get_device();
8687
auto ctxt = q.get_context();
@@ -121,10 +122,6 @@ int main(int argc, char *argv[]) {
121122
}
122123
}
123124

124-
for (int i = 0; i < NUM_BINS; i++) {
125-
bins[i] = 0;
126-
}
127-
128125
// ------------------------------------------------------------------------
129126
// CPU Execution:
130127

@@ -136,76 +133,89 @@ int main(int argc, char *argv[]) {
136133
image_channel_type::unsigned_int32,
137134
range<2>{width / sizeof(uint4), height});
138135

136+
// Launches the task on the GPU.
137+
double kernel_times = 0;
138+
unsigned num_iters = 10;
139+
139140
try {
140-
// create ranges
141-
// We need that many workitems
142-
auto GlobalRange = range<1>(range_width * range_height);
143-
// Number of workitems in a workgroup
144-
auto LocalRange = range<1>(1);
145-
nd_range<1> Range(GlobalRange, LocalRange);
146-
147-
auto e = q.submit([&](handler &cgh) {
148-
auto readAcc = Img.get_access<uint4, cl::sycl::access::mode::read>(cgh);
149-
150-
cgh.parallel_for<class Hist>(
151-
Range, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
152-
using namespace sycl::INTEL::gpu;
153-
154-
// Get thread origin offsets
155-
uint tid = ndi.get_group(0);
156-
uint h_pos = (tid % range_width) * BLOCK_WIDTH;
157-
uint v_pos = (tid / range_width) * BLOCK_HEIGHT;
158-
159-
// Declare a 8x32 uchar matrix to store the input block pixel value
160-
simd<unsigned char, 8 * 32> in;
161-
162-
// Declare a vector to store the local histogram
163-
simd<unsigned int, NUM_BINS> histogram(0);
164-
165-
// Each thread handles BLOCK_HEIGHTxBLOCK_WIDTH pixel block
166-
for (int y = 0; y < BLOCK_HEIGHT / 8; y++) {
167-
// Perform 2D media block read to load 8x32 pixel block
168-
in =
169-
media_block_load<unsigned char, 8, 32>(readAcc, h_pos, v_pos);
170-
171-
// Accumulate local histogram for each pixel value
141+
for (int iter = 0; iter <= num_iters; ++iter) {
142+
double etime = 0;
143+
for (int b = 0; b < NUM_BINS; b++)
144+
bins[b] = 0;
145+
// create ranges
146+
// We need that many workitems
147+
auto GlobalRange = range<1>(range_width * range_height);
148+
// Number of workitems in a workgroup
149+
auto LocalRange = range<1>(1);
150+
nd_range<1> Range(GlobalRange, LocalRange);
151+
152+
auto e = q.submit([&](handler &cgh) {
153+
auto readAcc = Img.get_access<uint4, cl::sycl::access::mode::read>(cgh);
154+
155+
cgh.parallel_for<class Hist>(
156+
Range, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
157+
using namespace sycl::INTEL::gpu;
158+
159+
// Get thread origin offsets
160+
uint tid = ndi.get_group(0);
161+
uint h_pos = (tid % range_width) * BLOCK_WIDTH;
162+
uint v_pos = (tid / range_width) * BLOCK_HEIGHT;
163+
164+
// Declare a 8x32 uchar matrix to store the input block pixel
165+
// value
166+
simd<unsigned char, 8 * 32> in;
167+
168+
// Declare a vector to store the local histogram
169+
simd<unsigned int, NUM_BINS> histogram(0);
170+
171+
// Each thread handles BLOCK_HEIGHTxBLOCK_WIDTH pixel block
172+
for (int y = 0; y < BLOCK_HEIGHT / 8; y++) {
173+
// Perform 2D media block read to load 8x32 pixel block
174+
in = media_block_load<unsigned char, 8, 32>(readAcc, h_pos,
175+
v_pos);
176+
177+
// Accumulate local histogram for each pixel value
172178
#pragma unroll
173-
for (int i = 0; i < 8; i++) {
179+
for (int i = 0; i < 8; i++) {
174180
#pragma unroll
175-
for (int j = 0; j < 32; j++) {
176-
histogram.select<1, 1>(in[i * 32 + j]) += 1;
181+
for (int j = 0; j < 32; j++) {
182+
histogram.select<1, 1>(in[i * 32 + j]) += 1;
183+
}
177184
}
178-
}
179185

180-
// Update starting offset for the next work block
181-
v_pos += 8;
182-
}
186+
// Update starting offset for the next work block
187+
v_pos += 8;
188+
}
183189

184-
// Declare a vector to store the offset for atomic write operation
185-
simd<unsigned int, 8> offset(0, 1); // init to 0, 1, 2, ..., 7
186-
offset *= sizeof(unsigned int);
190+
// Declare a vector to store the offset for atomic write operation
191+
simd<unsigned int, 8> offset(0, 1); // init to 0, 1, 2, ..., 7
192+
offset *= sizeof(unsigned int);
187193

188-
// Update global sum by atomically adding each local histogram
194+
// Update global sum by atomically adding each local histogram
189195
#pragma unroll
190-
for (int i = 0; i < NUM_BINS; i += 8) {
191-
// Declare a vector to store the source for atomic write operation
192-
simd<unsigned int, 8> src;
193-
src = histogram.select<8, 1>(i);
196+
for (int i = 0; i < NUM_BINS; i += 8) {
197+
// Declare a vector to store the source for atomic write
198+
// operation
199+
simd<unsigned int, 8> src;
200+
src = histogram.select<8, 1>(i);
194201

195202
#ifdef __SYCL_DEVICE_ONLY__
196-
flat_atomic<EsimdAtomicOpType::ATOMIC_ADD, unsigned int, 8>(
197-
bins, offset, src, 1);
198-
offset += 8 * sizeof(unsigned int);
203+
flat_atomic<EsimdAtomicOpType::ATOMIC_ADD, unsigned int, 8>(
204+
bins, offset, src, 1);
205+
offset += 8 * sizeof(unsigned int);
199206
#else
200207
auto vals = block_load<unsigned int, 8>(bins + i);
201208
vals = vals + src;
202209
block_store<unsigned int, 8>(bins + i, vals);
203210
#endif
204-
}
205-
});
206-
});
207-
e.wait();
208-
211+
}
212+
});
213+
});
214+
e.wait();
215+
etime = esimd_test::report_time("kernel time", e, e);
216+
if (iter > 0)
217+
kernel_times += etime;
218+
}
209219
// SYCL will enqueue and run the kernel. Recall that the buffer's data is
210220
// given back to the host at the end of scope.
211221
// make sure data is given back to the host at the end of this scope
@@ -214,6 +224,9 @@ int main(int argc, char *argv[]) {
214224
return e.get_cl_code();
215225
}
216226

227+
float kernel_time = kernel_times / num_iters;
228+
std::cerr << "GPU kernel time = " << kernel_time << " msec\n";
229+
217230
writeHist(bins);
218231
writeHist(cpuHistogram);
219232
// Checking Histogram

SYCL/ESIMD/histogram_256_slm.cpp

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,8 @@ int CheckHistogram(unsigned int *cpu_histogram, unsigned int *gpu_histogram) {
102102
}
103103

104104
int main() {
105-
queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler());
105+
queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler(),
106+
property::queue::enable_profiling{});
106107
auto dev = q.get_device();
107108
auto ctxt = q.get_context();
108109

@@ -146,7 +147,6 @@ int main() {
146147
// Uses the GPU to calculate the histogram output data.
147148
unsigned int *output_surface =
148149
(uint32_t *)malloc_shared(4 * NUM_BINS, dev, ctxt);
149-
memset(output_surface, 0, 4 * NUM_BINS);
150150

151151
unsigned int num_threads;
152152
num_threads = width * height / (NUM_BLOCKS * BLOCK_WIDTH * sizeof(int));
@@ -155,20 +155,33 @@ int main() {
155155
auto LocalRange = cl::sycl::range<1>(NUM_BINS / 16);
156156
cl::sycl::nd_range<1> Range(GlobalRange, LocalRange);
157157

158+
// Launches the task on the GPU.
159+
double kernel_times = 0;
160+
unsigned num_iters = 10;
158161
try {
159-
auto e = q.submit([&](cl::sycl::handler &cgh) {
160-
cgh.parallel_for<class histogram_slm>(
161-
Range, [=](cl::sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL {
162-
histogram_atomic(input_ptr, output_surface, ndi.get_group(0),
163-
ndi.get_local_id(0), 16);
164-
});
165-
});
166-
e.wait();
162+
for (int iter = 0; iter <= num_iters; ++iter) {
163+
double etime = 0;
164+
memset(output_surface, 0, 4 * NUM_BINS);
165+
auto e = q.submit([&](cl::sycl::handler &cgh) {
166+
cgh.parallel_for<class histogram_slm>(
167+
Range, [=](cl::sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL {
168+
histogram_atomic(input_ptr, output_surface, ndi.get_group(0),
169+
ndi.get_local_id(0), 16);
170+
});
171+
});
172+
e.wait();
173+
etime = esimd_test::report_time("kernel time", e, e);
174+
if (iter > 0)
175+
kernel_times += etime;
176+
}
167177
} catch (cl::sycl::exception const &e) {
168178
std::cout << "SYCL exception caught: " << e.what() << '\n';
169179
return e.get_cl_code();
170180
}
171181

182+
float kernel_time = kernel_times / num_iters;
183+
std::cerr << "GPU kernel time = " << kernel_time << " msec\n";
184+
172185
std::cout << "finish GPU histogram\n";
173186

174187
memcpy(hist, output_surface, 4 * NUM_BINS);

0 commit comments

Comments
 (0)