@@ -41,11 +41,11 @@ void histogram_CPU(unsigned int width, unsigned int height, unsigned char *srcY,
41
41
void writeHist (unsigned int *hist) {
42
42
int total = 0 ;
43
43
44
- std::cerr << " \n Histogram: \n " ;
44
+ // std::cerr << "\nHistogram: \n";
45
45
for (int i = 0 ; i < NUM_BINS; i += 8 ) {
46
- std::cerr << " \n [" << i << " - " << i + 7 << " ]:" ;
46
+ // std::cerr << "\n [" << i << " - " << i + 7 << "]:";
47
47
for (int j = 0 ; j < 8 ; j++) {
48
- std::cerr << " \t " << hist[i + j];
48
+ // std::cerr << "\t" << hist[i + j];
49
49
total += hist[i + j];
50
50
}
51
51
}
@@ -80,7 +80,8 @@ int main(int argc, char *argv[]) {
80
80
// Read in image luma plane
81
81
82
82
// Allocate Input Buffer
83
- queue q (esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler ());
83
+ queue q (esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler (),
84
+ property::queue::enable_profiling{});
84
85
85
86
auto dev = q.get_device ();
86
87
auto ctxt = q.get_context ();
@@ -121,10 +122,6 @@ int main(int argc, char *argv[]) {
121
122
}
122
123
}
123
124
124
- for (int i = 0 ; i < NUM_BINS; i++) {
125
- bins[i] = 0 ;
126
- }
127
-
128
125
// ------------------------------------------------------------------------
129
126
// CPU Execution:
130
127
@@ -136,76 +133,89 @@ int main(int argc, char *argv[]) {
136
133
image_channel_type::unsigned_int32,
137
134
range<2 >{width / sizeof (uint4), height});
138
135
136
+ // Launches the task on the GPU.
137
+ double kernel_times = 0 ;
138
+ unsigned num_iters = 10 ;
139
+
139
140
try {
140
- // create ranges
141
- // We need that many workitems
142
- auto GlobalRange = range<1 >(range_width * range_height);
143
- // Number of workitems in a workgroup
144
- auto LocalRange = range<1 >(1 );
145
- nd_range<1 > Range (GlobalRange, LocalRange);
146
-
147
- auto e = q.submit ([&](handler &cgh) {
148
- auto readAcc = Img.get_access <uint4, cl::sycl::access::mode::read>(cgh);
149
-
150
- cgh.parallel_for <class Hist >(
151
- Range, [=](nd_item<1 > ndi) SYCL_ESIMD_KERNEL {
152
- using namespace sycl ::INTEL::gpu;
153
-
154
- // Get thread origin offsets
155
- uint tid = ndi.get_group (0 );
156
- uint h_pos = (tid % range_width) * BLOCK_WIDTH;
157
- uint v_pos = (tid / range_width) * BLOCK_HEIGHT;
158
-
159
- // Declare a 8x32 uchar matrix to store the input block pixel value
160
- simd<unsigned char , 8 * 32 > in;
161
-
162
- // Declare a vector to store the local histogram
163
- simd<unsigned int , NUM_BINS> histogram (0 );
164
-
165
- // Each thread handles BLOCK_HEIGHTxBLOCK_WIDTH pixel block
166
- for (int y = 0 ; y < BLOCK_HEIGHT / 8 ; y++) {
167
- // Perform 2D media block read to load 8x32 pixel block
168
- in =
169
- media_block_load<unsigned char , 8 , 32 >(readAcc, h_pos, v_pos);
170
-
171
- // Accumulate local histogram for each pixel value
141
+ for (int iter = 0 ; iter <= num_iters; ++iter) {
142
+ double etime = 0 ;
143
+ for (int b = 0 ; b < NUM_BINS; b++)
144
+ bins[b] = 0 ;
145
+ // create ranges
146
+ // We need that many workitems
147
+ auto GlobalRange = range<1 >(range_width * range_height);
148
+ // Number of workitems in a workgroup
149
+ auto LocalRange = range<1 >(1 );
150
+ nd_range<1 > Range (GlobalRange, LocalRange);
151
+
152
+ auto e = q.submit ([&](handler &cgh) {
153
+ auto readAcc = Img.get_access <uint4, cl::sycl::access::mode::read>(cgh);
154
+
155
+ cgh.parallel_for <class Hist >(
156
+ Range, [=](nd_item<1 > ndi) SYCL_ESIMD_KERNEL {
157
+ using namespace sycl ::INTEL::gpu;
158
+
159
+ // Get thread origin offsets
160
+ uint tid = ndi.get_group (0 );
161
+ uint h_pos = (tid % range_width) * BLOCK_WIDTH;
162
+ uint v_pos = (tid / range_width) * BLOCK_HEIGHT;
163
+
164
+ // Declare a 8x32 uchar matrix to store the input block pixel
165
+ // value
166
+ simd<unsigned char , 8 * 32 > in;
167
+
168
+ // Declare a vector to store the local histogram
169
+ simd<unsigned int , NUM_BINS> histogram (0 );
170
+
171
+ // Each thread handles BLOCK_HEIGHTxBLOCK_WIDTH pixel block
172
+ for (int y = 0 ; y < BLOCK_HEIGHT / 8 ; y++) {
173
+ // Perform 2D media block read to load 8x32 pixel block
174
+ in = media_block_load<unsigned char , 8 , 32 >(readAcc, h_pos,
175
+ v_pos);
176
+
177
+ // Accumulate local histogram for each pixel value
172
178
#pragma unroll
173
- for (int i = 0 ; i < 8 ; i++) {
179
+ for (int i = 0 ; i < 8 ; i++) {
174
180
#pragma unroll
175
- for (int j = 0 ; j < 32 ; j++) {
176
- histogram.select <1 , 1 >(in[i * 32 + j]) += 1 ;
181
+ for (int j = 0 ; j < 32 ; j++) {
182
+ histogram.select <1 , 1 >(in[i * 32 + j]) += 1 ;
183
+ }
177
184
}
178
- }
179
185
180
- // Update starting offset for the next work block
181
- v_pos += 8 ;
182
- }
186
+ // Update starting offset for the next work block
187
+ v_pos += 8 ;
188
+ }
183
189
184
- // Declare a vector to store the offset for atomic write operation
185
- simd<unsigned int , 8 > offset (0 , 1 ); // init to 0, 1, 2, ..., 7
186
- offset *= sizeof (unsigned int );
190
+ // Declare a vector to store the offset for atomic write operation
191
+ simd<unsigned int , 8 > offset (0 , 1 ); // init to 0, 1, 2, ..., 7
192
+ offset *= sizeof (unsigned int );
187
193
188
- // Update global sum by atomically adding each local histogram
194
+ // Update global sum by atomically adding each local histogram
189
195
#pragma unroll
190
- for (int i = 0 ; i < NUM_BINS; i += 8 ) {
191
- // Declare a vector to store the source for atomic write operation
192
- simd<unsigned int , 8 > src;
193
- src = histogram.select <8 , 1 >(i);
196
+ for (int i = 0 ; i < NUM_BINS; i += 8 ) {
197
+ // Declare a vector to store the source for atomic write
198
+ // operation
199
+ simd<unsigned int , 8 > src;
200
+ src = histogram.select <8 , 1 >(i);
194
201
195
202
#ifdef __SYCL_DEVICE_ONLY__
196
- flat_atomic<EsimdAtomicOpType::ATOMIC_ADD, unsigned int , 8 >(
197
- bins, offset, src, 1 );
198
- offset += 8 * sizeof (unsigned int );
203
+ flat_atomic<EsimdAtomicOpType::ATOMIC_ADD, unsigned int , 8 >(
204
+ bins, offset, src, 1 );
205
+ offset += 8 * sizeof (unsigned int );
199
206
#else
200
207
auto vals = block_load<unsigned int , 8 >(bins + i);
201
208
vals = vals + src;
202
209
block_store<unsigned int , 8 >(bins + i, vals);
203
210
#endif
204
- }
205
- });
206
- });
207
- e.wait ();
208
-
211
+ }
212
+ });
213
+ });
214
+ e.wait ();
215
+ etime = esimd_test::report_time (" kernel time" , e, e);
216
+ if (iter > 0 )
217
+ kernel_times += etime;
218
+ }
209
219
// SYCL will enqueue and run the kernel. Recall that the buffer's data is
210
220
// given back to the host at the end of scope.
211
221
// make sure data is given back to the host at the end of this scope
@@ -214,6 +224,9 @@ int main(int argc, char *argv[]) {
214
224
return e.get_cl_code ();
215
225
}
216
226
227
+ float kernel_time = kernel_times / num_iters;
228
+ std::cerr << " GPU kernel time = " << kernel_time << " msec\n " ;
229
+
217
230
writeHist (bins);
218
231
writeHist (cpuHistogram);
219
232
// Checking Histogram
0 commit comments