intel · vladimirlaz · Feb 18, 2021 · Feb 16, 2021
@@ -506,16 +506,6 @@ ESIMD_INLINE void cmk_bitonic_merge(AccTy buf, uint32_t n, uint32_t m,
   }
 }
 
-static double report_time(const string &msg, event e0, event en) {
-  cl_ulong time_start =
-      e0.get_profiling_info<info::event_profiling::command_start>();
-  cl_ulong time_end =
-      en.get_profiling_info<info::event_profiling::command_end>();
-  double elapsed = (time_end - time_start) / 1e6;
-  cout << msg << elapsed << " milliseconds" << std::endl;
-  return elapsed;
-}
-
 struct BitonicSort {
   enum {
     base_sort_size_ = 256,
@@ -615,7 +605,7 @@ int BitonicSort::Solve(uint32_t *pInputs, uint32_t *pOutputs, uint32_t size) {
                                       });
     });
     e.wait();
-    total_time += report_time("kernel time", e, e);
+    total_time += esimd_test::report_time("kernel time", e, e);
   } catch (cl::sycl::exception const &e) {
     std::cout << "SYCL exception caught: " << e.what() << '\n';
     return e.get_cl_code();
@@ -663,7 +653,8 @@ int BitonicSort::Solve(uint32_t *pInputs, uint32_t *pOutputs, uint32_t size) {
   }
 
   mergeEvent[k - 1].wait();
-  total_time += report_time("kernel time", mergeEvent[0], mergeEvent[k - 1]);
+  total_time +=
+      esimd_test::report_time("kernel time", mergeEvent[0], mergeEvent[k - 1]);
 
   cout << " Sorting Time = " << total_time << " msec " << std::endl;
   return 1;

@@ -421,16 +421,6 @@ ESIMD_INLINE void cmk_bitonic_merge(uint32_t *buf, uint32_t n, uint32_t m,
   }
 }
 
-static double report_time(const string &msg, event e0, event en) {
-  cl_ulong time_start =
-      e0.get_profiling_info<info::event_profiling::command_start>();
-  cl_ulong time_end =
-      en.get_profiling_info<info::event_profiling::command_end>();
-  double elapsed = (time_end - time_start) / 1e6;
-  cout << msg << elapsed << " milliseconds" << std::endl;
-  return elapsed;
-}
-
 struct BitonicSort {
   enum {
     base_sort_size_ = 256,
@@ -532,7 +522,7 @@ int BitonicSort::Solve(uint32_t *pInputs, uint32_t *pOutputs, uint32_t size) {
           });
     });
     e.wait();
-    total_time += report_time("kernel time", e, e);
+    total_time += esimd_test::report_time("kernel time", e, e);
   } catch (cl::sycl::exception const &e) {
     std::cout << "SYCL exception caught: " << e.what() << '\n';
     return e.get_cl_code();
@@ -580,7 +570,8 @@ int BitonicSort::Solve(uint32_t *pInputs, uint32_t *pOutputs, uint32_t size) {
   }
 
   mergeEvent[k - 1].wait();
-  total_time += report_time("kernel time", mergeEvent[0], mergeEvent[k - 1]);
+  total_time +=
+      esimd_test::report_time("kernel time", mergeEvent[0], mergeEvent[k - 1]);
 
   cout << " Sorting Time = " << total_time << " msec " << std::endl;
   return 1;

@@ -11,6 +11,7 @@
 #define NOMINMAX
 
 #include <algorithm>
+#include <chrono>
 #include <cstring>
 #include <fstream>
 #include <iostream>
@@ -161,4 +162,32 @@ To bit_cast(const From &src) noexcept {
   return dst;
 }
 
+// Timer class for measuring elasped time
+class Timer {
+public:
+  Timer() : start_(std::chrono::steady_clock::now()) {}
+
+  double Elapsed() {
+    auto now = std::chrono::steady_clock::now();
+    return std::chrono::duration_cast<Duration>(now - start_).count();
+  }
+
+private:
+  using Duration = std::chrono::duration<double>;
+  std::chrono::steady_clock::time_point start_;
+};
+
+// e0 is the first event, en is the last event
+// find the time difference between the starting time of the e0 and
+// the ending time of en, return micro-second
+inline double report_time(const std::string &msg, event e0, event en) {
+  cl_ulong time_start =
+      e0.get_profiling_info<info::event_profiling::command_start>();
+  cl_ulong time_end =
+      en.get_profiling_info<info::event_profiling::command_end>();
+  double elapsed = (time_end - time_start) / 1e6;
+  // cerr << msg << elapsed << " msecs" << std::endl;
+  return elapsed;
+}
+
 } // namespace esimd_test
@@ -41,11 +41,11 @@ void histogram_CPU(unsigned int width, unsigned int height, unsigned char *srcY,
 void writeHist(unsigned int *hist) {
   int total = 0;
 
-  std::cerr << "\nHistogram: \n";
+  // std::cerr << "\nHistogram: \n";
   for (int i = 0; i < NUM_BINS; i += 8) {
-    std::cerr << "\n  [" << i << " - " << i + 7 << "]:";
+    // std::cerr << "\n  [" << i << " - " << i + 7 << "]:";
     for (int j = 0; j < 8; j++) {
-      std::cerr << "\t" << hist[i + j];
+      // std::cerr << "\t" << hist[i + j];
       total += hist[i + j];
     }
   }
@@ -80,7 +80,8 @@ int main(int argc, char *argv[]) {
   // Read in image luma plane
 
   // Allocate Input Buffer
-  queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler());
+  queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler(),
+          property::queue::enable_profiling{});
 
   auto dev = q.get_device();
   auto ctxt = q.get_context();
@@ -121,10 +122,6 @@ int main(int argc, char *argv[]) {
     }
   }
 
-  for (int i = 0; i < NUM_BINS; i++) {
-    bins[i] = 0;
-  }
-
   // ------------------------------------------------------------------------
   // CPU Execution:
 
@@ -136,76 +133,89 @@ int main(int argc, char *argv[]) {
                          image_channel_type::unsigned_int32,
                          range<2>{width / sizeof(uint4), height});
 
+  // Launches the task on the GPU.
+  double kernel_times = 0;
+  unsigned num_iters = 10;
+
   try {
-    // create ranges
-    // We need that many workitems
-    auto GlobalRange = range<1>(range_width * range_height);
-    // Number of workitems in a workgroup
-    auto LocalRange = range<1>(1);
-    nd_range<1> Range(GlobalRange, LocalRange);
-
-    auto e = q.submit([&](handler &cgh) {
-      auto readAcc = Img.get_access<uint4, cl::sycl::access::mode::read>(cgh);
-
-      cgh.parallel_for<class Hist>(
-          Range, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
-            using namespace sycl::INTEL::gpu;
-
-            // Get thread origin offsets
-            uint tid = ndi.get_group(0);
-            uint h_pos = (tid % range_width) * BLOCK_WIDTH;
-            uint v_pos = (tid / range_width) * BLOCK_HEIGHT;
-
-            // Declare a 8x32 uchar matrix to store the input block pixel value
-            simd<unsigned char, 8 * 32> in;
-
-            // Declare a vector to store the local histogram
-            simd<unsigned int, NUM_BINS> histogram(0);
-
-            // Each thread handles BLOCK_HEIGHTxBLOCK_WIDTH pixel block
-            for (int y = 0; y < BLOCK_HEIGHT / 8; y++) {
-              // Perform 2D media block read to load 8x32 pixel block
-              in =
-                  media_block_load<unsigned char, 8, 32>(readAcc, h_pos, v_pos);
-
-          // Accumulate local histogram for each pixel value
+    for (int iter = 0; iter <= num_iters; ++iter) {
+      double etime = 0;
+      for (int b = 0; b < NUM_BINS; b++)
+        bins[b] = 0;
+      // create ranges
+      // We need that many workitems
+      auto GlobalRange = range<1>(range_width * range_height);
+      // Number of workitems in a workgroup
+      auto LocalRange = range<1>(1);
+      nd_range<1> Range(GlobalRange, LocalRange);
+
+      auto e = q.submit([&](handler &cgh) {
+        auto readAcc = Img.get_access<uint4, cl::sycl::access::mode::read>(cgh);
+
+        cgh.parallel_for<class Hist>(
+            Range, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
+              using namespace sycl::INTEL::gpu;
+
+              // Get thread origin offsets
+              uint tid = ndi.get_group(0);
+              uint h_pos = (tid % range_width) * BLOCK_WIDTH;
+              uint v_pos = (tid / range_width) * BLOCK_HEIGHT;
+
+              // Declare a 8x32 uchar matrix to store the input block pixel
+              // value
+              simd<unsigned char, 8 * 32> in;
+
+              // Declare a vector to store the local histogram
+              simd<unsigned int, NUM_BINS> histogram(0);
+
+              // Each thread handles BLOCK_HEIGHTxBLOCK_WIDTH pixel block
+              for (int y = 0; y < BLOCK_HEIGHT / 8; y++) {
+                // Perform 2D media block read to load 8x32 pixel block
+                in = media_block_load<unsigned char, 8, 32>(readAcc, h_pos,
+                                                            v_pos);
+
+            // Accumulate local histogram for each pixel value
 #pragma unroll
-              for (int i = 0; i < 8; i++) {
+                for (int i = 0; i < 8; i++) {
 #pragma unroll
-                for (int j = 0; j < 32; j++) {
-                  histogram.select<1, 1>(in[i * 32 + j]) += 1;
+                  for (int j = 0; j < 32; j++) {
+                    histogram.select<1, 1>(in[i * 32 + j]) += 1;
+                  }
                 }
-              }
 
-              // Update starting offset for the next work block
-              v_pos += 8;
-            }
+                // Update starting offset for the next work block
+                v_pos += 8;
+              }
 
-            // Declare a vector to store the offset for atomic write operation
-            simd<unsigned int, 8> offset(0, 1); // init to 0, 1, 2, ..., 7
-            offset *= sizeof(unsigned int);
+              // Declare a vector to store the offset for atomic write operation
+              simd<unsigned int, 8> offset(0, 1); // init to 0, 1, 2, ..., 7
+              offset *= sizeof(unsigned int);
 
-        // Update global sum by atomically adding each local histogram
+          // Update global sum by atomically adding each local histogram
 #pragma unroll
-            for (int i = 0; i < NUM_BINS; i += 8) {
-              // Declare a vector to store the source for atomic write operation
-              simd<unsigned int, 8> src;
-              src = histogram.select<8, 1>(i);
+              for (int i = 0; i < NUM_BINS; i += 8) {
+                // Declare a vector to store the source for atomic write
+                // operation
+                simd<unsigned int, 8> src;
+                src = histogram.select<8, 1>(i);
 
 #ifdef __SYCL_DEVICE_ONLY__
-              flat_atomic<EsimdAtomicOpType::ATOMIC_ADD, unsigned int, 8>(
-                  bins, offset, src, 1);
-              offset += 8 * sizeof(unsigned int);
+                flat_atomic<EsimdAtomicOpType::ATOMIC_ADD, unsigned int, 8>(
+                    bins, offset, src, 1);
+                offset += 8 * sizeof(unsigned int);
 #else
               auto vals = block_load<unsigned int, 8>(bins + i);
               vals = vals + src;
               block_store<unsigned int, 8>(bins + i, vals);
 #endif
-            }
-          });
-    });
-    e.wait();
-
+              }
+            });
+      });
+      e.wait();
+      etime = esimd_test::report_time("kernel time", e, e);
+      if (iter > 0)
+        kernel_times += etime;
+    }
     // SYCL will enqueue and run the kernel. Recall that the buffer's data is
     // given back to the host at the end of scope.
     // make sure data is given back to the host at the end of this scope
@@ -214,6 +224,9 @@ int main(int argc, char *argv[]) {
     return e.get_cl_code();
   }
 
+  float kernel_time = kernel_times / num_iters;
+  std::cerr << "GPU kernel time = " << kernel_time << " msec\n";
+
   writeHist(bins);
   writeHist(cpuHistogram);
   // Checking Histogram

@@ -102,7 +102,8 @@ int CheckHistogram(unsigned int *cpu_histogram, unsigned int *gpu_histogram) {
 }
 
 int main() {
-  queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler());
+  queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler(),
+          property::queue::enable_profiling{});
   auto dev = q.get_device();
   auto ctxt = q.get_context();
 
@@ -146,7 +147,6 @@ int main() {
   // Uses the GPU to calculate the histogram output data.
   unsigned int *output_surface =
       (uint32_t *)malloc_shared(4 * NUM_BINS, dev, ctxt);
-  memset(output_surface, 0, 4 * NUM_BINS);
 
   unsigned int num_threads;
   num_threads = width * height / (NUM_BLOCKS * BLOCK_WIDTH * sizeof(int));
@@ -155,20 +155,33 @@ int main() {
   auto LocalRange = cl::sycl::range<1>(NUM_BINS / 16);
   cl::sycl::nd_range<1> Range(GlobalRange, LocalRange);
 
+  // Launches the task on the GPU.
+  double kernel_times = 0;
+  unsigned num_iters = 10;
   try {
-    auto e = q.submit([&](cl::sycl::handler &cgh) {
-      cgh.parallel_for<class histogram_slm>(
-          Range, [=](cl::sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL {
-            histogram_atomic(input_ptr, output_surface, ndi.get_group(0),
-                             ndi.get_local_id(0), 16);
-          });
-    });
-    e.wait();
+    for (int iter = 0; iter <= num_iters; ++iter) {
+      double etime = 0;
+      memset(output_surface, 0, 4 * NUM_BINS);
+      auto e = q.submit([&](cl::sycl::handler &cgh) {
+        cgh.parallel_for<class histogram_slm>(
+            Range, [=](cl::sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL {
+              histogram_atomic(input_ptr, output_surface, ndi.get_group(0),
+                               ndi.get_local_id(0), 16);
+            });
+      });
+      e.wait();
+      etime = esimd_test::report_time("kernel time", e, e);
+      if (iter > 0)
+        kernel_times += etime;
+    }
   } catch (cl::sycl::exception const &e) {
     std::cout << "SYCL exception caught: " << e.what() << '\n';
     return e.get_cl_code();
   }
 
+  float kernel_time = kernel_times / num_iters;
+  std::cerr << "GPU kernel time = " << kernel_time << " msec\n";
+
   std::cout << "finish GPU histogram\n";
 
   memcpy(hist, output_surface, 4 * NUM_BINS);