@@ -79,7 +79,8 @@ int main(int argc, char *argv[]) {
79
79
// Read in image luma plane
80
80
81
81
// Allocate Input Buffer
82
- queue q (esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler ());
82
+ queue q (esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler (),
83
+ property::queue::enable_profiling{});
83
84
84
85
auto dev = q.get_device ();
85
86
auto ctxt = q.get_context ();
@@ -135,84 +136,109 @@ int main(int argc, char *argv[]) {
135
136
image_channel_type::unsigned_int32,
136
137
range<2 >{width / sizeof (uint4), height});
137
138
138
- try {
139
- // create ranges
140
- // We need that many workitems
141
- auto GlobalRange = range<2 >(range_width, range_height);
142
- // Number of workitems in a workgroup
143
- auto LocalRange = range<2 >(1 , 1 );
144
- nd_range<2 > Range (GlobalRange, LocalRange);
145
-
146
- auto e = q.submit ([&](handler &cgh) {
147
- auto readAcc = Img.get_access <uint4, cl::sycl::access::mode::read>(cgh);
148
-
149
- cgh.parallel_for <class Hist >(
150
- Range, [=](nd_item<2 > ndi) SYCL_ESIMD_KERNEL {
151
- using namespace sycl ::ext::intel::experimental::esimd;
152
-
153
- // Get thread origin offsets
154
- uint h_pos = ndi.get_group (0 ) * BLOCK_WIDTH;
155
- uint v_pos = ndi.get_group (1 ) * BLOCK_HEIGHT;
139
+ // Start Timer
140
+ esimd_test::Timer timer;
141
+ double start;
156
142
157
- // Declare a 8x32 uchar matrix to store the input block pixel value
158
- simd<unsigned char , 8 * 32 > in;
159
-
160
- // Declare a vector to store the local histogram
161
- simd<unsigned int , NUM_BINS> histogram (0 );
162
-
163
- // Each thread handles BLOCK_HEIGHTxBLOCK_WIDTH pixel block
164
- for (int y = 0 ; y < BLOCK_HEIGHT / 8 ; y++) {
165
- // Perform 2D media block read to load 8x32 pixel block
166
- in =
167
- media_block_load<unsigned char , 8 , 32 >(readAcc, h_pos, v_pos);
168
-
169
- // Accumulate local histogram for each pixel value
143
+ double kernel_times = 0 ;
144
+ unsigned num_iters = 10 ;
145
+ try {
146
+ // num_iters + 1, iteration#0 is for warmup
147
+ for (int iter = 0 ; iter <= num_iters; ++iter) {
148
+ double etime = 0 ;
149
+ for (int b = 0 ; b < NUM_BINS; b++)
150
+ bins[b] = 0 ;
151
+ // create ranges
152
+ // We need that many workitems
153
+ auto GlobalRange = range<2 >(range_width, range_height);
154
+ // Number of workitems in a workgroup
155
+ auto LocalRange = range<2 >(1 , 1 );
156
+ nd_range<2 > Range (GlobalRange, LocalRange);
157
+
158
+ auto e = q.submit ([&](handler &cgh) {
159
+ auto readAcc = Img.get_access <uint4, cl::sycl::access::mode::read>(cgh);
160
+
161
+ cgh.parallel_for <class Hist >(
162
+ Range, [=](nd_item<2 > ndi) SYCL_ESIMD_KERNEL {
163
+ using namespace sycl ::ext::intel::experimental::esimd;
164
+
165
+ // Get thread origin offsets
166
+ uint h_pos = ndi.get_group (0 ) * BLOCK_WIDTH;
167
+ uint v_pos = ndi.get_group (1 ) * BLOCK_HEIGHT;
168
+
169
+ // Declare a 8x32 uchar matrix to store the input block pixel
170
+ // value
171
+ simd<unsigned char , 8 * 32 > in;
172
+
173
+ // Declare a vector to store the local histogram
174
+ simd<unsigned int , NUM_BINS> histogram (0 );
175
+
176
+ // Each thread handles BLOCK_HEIGHTxBLOCK_WIDTH pixel block
177
+ for (int y = 0 ; y < BLOCK_HEIGHT / 8 ; y++) {
178
+ // Perform 2D media block read to load 8x32 pixel block
179
+ in = media_block_load<unsigned char , 8 , 32 >(readAcc, h_pos,
180
+ v_pos);
181
+
182
+ // Accumulate local histogram for each pixel value
170
183
#pragma unroll
171
- for (int i = 0 ; i < 8 ; i++) {
184
+ for (int i = 0 ; i < 8 ; i++) {
172
185
#pragma unroll
173
- for (int j = 0 ; j < 32 ; j++) {
174
- histogram.select <1 , 1 >(in[i * 32 + j]) += 1 ;
186
+ for (int j = 0 ; j < 32 ; j++) {
187
+ histogram.select <1 , 1 >(in[i * 32 + j]) += 1 ;
188
+ }
175
189
}
176
- }
177
190
178
- // Update starting offset for the next work block
179
- v_pos += 8 ;
180
- }
191
+ // Update starting offset for the next work block
192
+ v_pos += 8 ;
193
+ }
181
194
182
- // Declare a vector to store the offset for atomic write operation
183
- simd<unsigned int , 8 > offset (0 , 1 ); // init to 0, 1, 2, ..., 7
184
- offset *= sizeof (unsigned int );
195
+ // Declare a vector to store the offset for atomic write operation
196
+ simd<unsigned int , 8 > offset (0 , 1 ); // init to 0, 1, 2, ..., 7
197
+ offset *= sizeof (unsigned int );
185
198
186
- // Update global sum by atomically adding each local histogram
199
+ // Update global sum by atomically adding each local histogram
187
200
#pragma unroll
188
- for (int i = 0 ; i < NUM_BINS; i += 8 ) {
189
- // Declare a vector to store the source for atomic write operation
190
- simd<unsigned int , 8 > src;
191
- src = histogram.select <8 , 1 >(i);
201
+ for (int i = 0 ; i < NUM_BINS; i += 8 ) {
202
+ // Declare a vector to store the source for atomic write
203
+ // operation
204
+ simd<unsigned int , 8 > src;
205
+ src = histogram.select <8 , 1 >(i);
192
206
193
207
#ifdef __SYCL_DEVICE_ONLY__
194
- flat_atomic<atomic_op::add, unsigned int , 8 >(bins, offset, src,
195
- 1 );
196
- offset += 8 * sizeof (unsigned int );
208
+ flat_atomic<atomic_op::add, unsigned int , 8 >(bins, offset, src,
209
+ 1 );
210
+ offset += 8 * sizeof (unsigned int );
197
211
#else
198
- simd<unsigned int , 8 > vals;
199
- vals.copy_from (bins + i);
200
- vals = vals + src;
201
- vals.copy_to (bins + i);
212
+ simd<unsigned int , 8 > vals;
213
+ vals.copy_from (bins + i);
214
+ vals = vals + src;
215
+ vals.copy_to (bins + i);
202
216
#endif
203
- }
204
- });
205
- });
206
- e.wait ();
217
+ }
218
+ });
219
+ });
220
+ e.wait ();
221
+ etime = esimd_test::report_time (" kernel time" , e, e);
222
+ if (iter > 0 )
223
+ kernel_times += etime;
224
+ else
225
+ start = timer.Elapsed ();
226
+ }
207
227
208
228
// SYCL will enqueue and run the kernel. Recall that the buffer's data is
209
229
// given back to the host at the end of scope.
210
230
// make sure data is given back to the host at the end of this scope
211
231
} catch (cl::sycl::exception const &e) {
212
232
std::cout << " SYCL exception caught: " << e.what () << ' \n ' ;
213
- return e. get_cl_code () ;
233
+ return 1 ;
214
234
}
215
235
236
+ // End timer.
237
+ double end = timer.Elapsed ();
238
+
239
+ esimd_test::display_timing_stats (kernel_times, num_iters,
240
+ (end - start) * 1000 );
241
+
216
242
writeHist (bins);
217
243
writeHist (cpuHistogram);
218
244
// Checking Histogram
0 commit comments