intel · kbobrovs · Oct 21, 2021 · Oct 20, 2021 · Oct 21, 2021 · Oct 21, 2021
@@ -190,4 +190,12 @@ inline double report_time(const std::string &msg, event e0, event en) {
   return elapsed;
 }
 
+void display_timing_stats(double const kernelTime,
+                          unsigned int const uiNumberOfIterations,
+                          double const overallTime) {
+  std::cout << "Number of iterations: " << uiNumberOfIterations << "\n";
+  std::cout << "[KernelTime]:" << kernelTime << "\n";
+  std::cout << "[OverallTime][Primary]:" << overallTime << "\n";
+}
+
 } // namespace esimd_test
@@ -132,11 +132,16 @@ int main(int argc, char *argv[]) {
                          image_channel_type::unsigned_int32,
                          range<2>{width / sizeof(uint4), height});
 
+  // Start Timer
+  esimd_test::Timer timer;
+  double start;
+
   // Launches the task on the GPU.
   double kernel_times = 0;
   unsigned num_iters = 10;
 
   try {
+    // num_iters + 1, iteration#0 is for warmup
     for (int iter = 0; iter <= num_iters; ++iter) {
       double etime = 0;
       for (int b = 0; b < NUM_BINS; b++)
@@ -215,6 +220,8 @@ int main(int argc, char *argv[]) {
       etime = esimd_test::report_time("kernel time", e, e);
       if (iter > 0)
         kernel_times += etime;
+      else
+        start = timer.Elapsed();
     }
     // SYCL will enqueue and run the kernel. Recall that the buffer's data is
     // given back to the host at the end of scope.
@@ -224,8 +231,13 @@ int main(int argc, char *argv[]) {
     return e.get_cl_code();
   }
 
-  float kernel_time = kernel_times / num_iters;
-  std::cerr << "GPU kernel time = " << kernel_time << " msec\n";
+  // End timer.
+  double end = timer.Elapsed();
+
+  esimd_test::display_timing_stats(kernel_times, num_iters,
+                                   (end - start) * 1000);
+  // float kernel_time = kernel_times / num_iters;
+  // std::cerr << "GPU kernel time = " << kernel_time << " msec\n";
 
   writeHist(bins);
   writeHist(cpuHistogram);

@@ -156,6 +156,10 @@ int main() {
   auto LocalRange = cl::sycl::range<1>(NUM_BINS / 16);
   cl::sycl::nd_range<1> Range(GlobalRange, LocalRange);
 
+  // Start Timer
+  esimd_test::Timer timer;
+  double start;
+
   // Launches the task on the GPU.
   double kernel_times = 0;
   unsigned num_iters = 10;
@@ -174,14 +178,21 @@ int main() {
       etime = esimd_test::report_time("kernel time", e, e);
       if (iter > 0)
         kernel_times += etime;
+      else
+        start = timer.Elapsed();
     }
   } catch (cl::sycl::exception const &e) {
     std::cout << "SYCL exception caught: " << e.what() << '\n';
     return e.get_cl_code();
   }
 
-  float kernel_time = kernel_times / num_iters;
-  std::cerr << "GPU kernel time = " << kernel_time << " msec\n";
+  // End timer.
+  double end = timer.Elapsed();
+
+  esimd_test::display_timing_stats(kernel_times, num_iters,
+                                   (end - start) * 1000);
+  // float kernel_time = kernel_times / num_iters;
+  // std::cerr << "GPU kernel time = " << kernel_time << " msec\n";
 
   std::cout << "finish GPU histogram\n";
 

@@ -160,23 +160,46 @@ int main(int argc, char **argv) {
   auto LocalRange = cl::sycl::range<1>(NUM_BINS / 16);
   cl::sycl::nd_range<1> Range(GlobalRange, LocalRange);
 
+  // Start Timer
+  esimd_test::Timer timer;
+  double start;
+
+  double kernel_times = 0;
+  unsigned num_iters = 10;
   try {
-    auto e = q.submit([&](cl::sycl::handler &cgh) {
-      cgh.set_specialization_constant<NumBlocksSpecId>(num_blocks);
-      cgh.parallel_for<histogram_slm>(
-          Range,
-          [=](cl::sycl::nd_item<1> ndi, kernel_handler kh) SYCL_ESIMD_KERNEL {
-            histogram_atomic(input_ptr, output_surface, ndi.get_group(0),
-                             ndi.get_local_id(0), 16,
-                             kh.get_specialization_constant<NumBlocksSpecId>());
-          });
-    });
-    e.wait();
+    // num_iters + 1, iteration#0 is for warmup
+    for (int iter = 0; iter <= num_iters; ++iter) {
+      double etime = 0;
+      memset(output_surface, 0, 4 * NUM_BINS);
+      auto e = q.submit([&](cl::sycl::handler &cgh) {
+        cgh.set_specialization_constant<NumBlocksSpecId>(num_blocks);
+        cgh.parallel_for<histogram_slm>(
+            Range,
+            [=](cl::sycl::nd_item<1> ndi, kernel_handler kh) SYCL_ESIMD_KERNEL {
+              histogram_atomic(
+                  input_ptr, output_surface, ndi.get_group(0),
+                  ndi.get_local_id(0), 16,
+                  kh.get_specialization_constant<NumBlocksSpecId>());
+            });
+      });
+      e.wait();
+      etime = esimd_test::report_time("kernel time", e, e);
+      if (iter > 0)
+        kernel_times += etime;
+      else
+        start = timer.Elapsed();
+    }
   } catch (cl::sycl::exception const &e) {
     std::cout << "SYCL exception caught: " << e.what() << '\n';
     return e.get_cl_code();
   }
 
+  // End timer.
+  double end = timer.Elapsed();
+
+  esimd_test::display_timing_stats(kernel_times, num_iters,
+                                   (end - start) * 1000);
+
   std::cout << "finish GPU histogram\n";
 
   memcpy(hist, output_surface, 4 * NUM_BINS);

@@ -135,75 +135,94 @@ int main(int argc, char *argv[]) {
                          image_channel_type::unsigned_int32,
                          range<2>{width / sizeof(uint4), height});
 
-  try {
-    // create ranges
-    // We need that many workitems
-    auto GlobalRange = range<2>(range_width, range_height);
-    // Number of workitems in a workgroup
-    auto LocalRange = range<2>(1, 1);
-    nd_range<2> Range(GlobalRange, LocalRange);
-
-    auto e = q.submit([&](handler &cgh) {
-      auto readAcc = Img.get_access<uint4, cl::sycl::access::mode::read>(cgh);
-
-      cgh.parallel_for<class Hist>(
-          Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL {
-            using namespace sycl::ext::intel::experimental::esimd;
-
-            // Get thread origin offsets
-            uint h_pos = ndi.get_group(0) * BLOCK_WIDTH;
-            uint v_pos = ndi.get_group(1) * BLOCK_HEIGHT;
-
-            // Declare a 8x32 uchar matrix to store the input block pixel value
-            simd<unsigned char, 8 * 32> in;
-
-            // Declare a vector to store the local histogram
-            simd<unsigned int, NUM_BINS> histogram(0);
-
-            // Each thread handles BLOCK_HEIGHTxBLOCK_WIDTH pixel block
-            for (int y = 0; y < BLOCK_HEIGHT / 8; y++) {
-              // Perform 2D media block read to load 8x32 pixel block
-              in =
-                  media_block_load<unsigned char, 8, 32>(readAcc, h_pos, v_pos);
+  // Start Timer
+  esimd_test::Timer timer;
+  double start;
 
-          // Accumulate local histogram for each pixel value
+  double kernel_times = 0;
+  unsigned num_iters = 10;
+  try {
+    // num_iters + 1, iteration#0 is for warmup
+    for (int iter = 0; iter <= num_iters; ++iter) {
+      double etime = 0;
+      for (int b = 0; b < NUM_BINS; b++)
+        bins[b] = 0;
+      // create ranges
+      // We need that many workitems
+      auto GlobalRange = range<2>(range_width, range_height);
+      // Number of workitems in a workgroup
+      auto LocalRange = range<2>(1, 1);
+      nd_range<2> Range(GlobalRange, LocalRange);
+
+      auto e = q.submit([&](handler &cgh) {
+        auto readAcc = Img.get_access<uint4, cl::sycl::access::mode::read>(cgh);
+
+        cgh.parallel_for<class Hist>(
+            Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL {
+              using namespace sycl::ext::intel::experimental::esimd;
+
+              // Get thread origin offsets
+              uint h_pos = ndi.get_group(0) * BLOCK_WIDTH;
+              uint v_pos = ndi.get_group(1) * BLOCK_HEIGHT;
+
+              // Declare a 8x32 uchar matrix to store the input block pixel
+              // value
+              simd<unsigned char, 8 * 32> in;
+
+              // Declare a vector to store the local histogram
+              simd<unsigned int, NUM_BINS> histogram(0);
+
+              // Each thread handles BLOCK_HEIGHTxBLOCK_WIDTH pixel block
+              for (int y = 0; y < BLOCK_HEIGHT / 8; y++) {
+                // Perform 2D media block read to load 8x32 pixel block
+                in = media_block_load<unsigned char, 8, 32>(readAcc, h_pos,
+                                                            v_pos);
+
+            // Accumulate local histogram for each pixel value
 #pragma unroll
-              for (int i = 0; i < 8; i++) {
+                for (int i = 0; i < 8; i++) {
 #pragma unroll
-                for (int j = 0; j < 32; j++) {
-                  histogram.select<1, 1>(in[i * 32 + j]) += 1;
+                  for (int j = 0; j < 32; j++) {
+                    histogram.select<1, 1>(in[i * 32 + j]) += 1;
+                  }
                 }
-              }
 
-              // Update starting offset for the next work block
-              v_pos += 8;
-            }
+                // Update starting offset for the next work block
+                v_pos += 8;
+              }
 
-            // Declare a vector to store the offset for atomic write operation
-            simd<unsigned int, 8> offset(0, 1); // init to 0, 1, 2, ..., 7
-            offset *= sizeof(unsigned int);
+              // Declare a vector to store the offset for atomic write operation
+              simd<unsigned int, 8> offset(0, 1); // init to 0, 1, 2, ..., 7
+              offset *= sizeof(unsigned int);
 
-        // Update global sum by atomically adding each local histogram
+          // Update global sum by atomically adding each local histogram
 #pragma unroll
-            for (int i = 0; i < NUM_BINS; i += 8) {
-              // Declare a vector to store the source for atomic write operation
-              simd<unsigned int, 8> src;
-              src = histogram.select<8, 1>(i);
+              for (int i = 0; i < NUM_BINS; i += 8) {
+                // Declare a vector to store the source for atomic write
+                // operation
+                simd<unsigned int, 8> src;
+                src = histogram.select<8, 1>(i);
 
 #ifdef __SYCL_DEVICE_ONLY__
-              flat_atomic<atomic_op::add, unsigned int, 8>(bins, offset, src,
-                                                           1);
-              offset += 8 * sizeof(unsigned int);
+                flat_atomic<atomic_op::add, unsigned int, 8>(bins, offset, src,
+                                                             1);
+                offset += 8 * sizeof(unsigned int);
 #else
-              simd<unsigned int, 8> vals;
-              vals.copy_from(bins + i);
-              vals = vals + src;
-              vals.copy_to(bins + i);
+                simd<unsigned int, 8> vals;
+                vals.copy_from(bins + i);
+                vals = vals + src;
+                vals.copy_to(bins + i);
 #endif
-            }
-          });
-    });
-    e.wait();
+              }
+            });
+      });
+      e.wait();
+      etime = esimd_test::report_time("kernel time", e, e);
+      if (iter > 0)
+        kernel_times += etime;
+      else
+        start = timer.Elapsed();
+    }
 
     // SYCL will enqueue and run the kernel. Recall that the buffer's data is
     // given back to the host at the end of scope.
@@ -213,6 +232,12 @@ int main(int argc, char *argv[]) {
     return e.get_cl_code();
   }
 
+  // End timer.
+  double end = timer.Elapsed();
+
+  esimd_test::display_timing_stats(kernel_times, num_iters,
+                                   (end - start) * 1000);
+
   writeHist(bins);
   writeHist(cpuHistogram);
   // Checking Histogram