[SYCL][E2E] Introduce performance tests (#12372)

aelovikov-intel · web-flow · commit 7af64dfab8cb · 2024-01-12T11:35:17.000-08:00
Adds a directory under sycl/test-e2e whose tests are only executed when
"llvm-lit --param enable-perf-tests=1" is passed, and changes the post-commit
task so that those tests run on a single worker with their output captured
even on PASSes.

In a separate PR, I plan on modifying pre-commit to run the same tests when a
specific label is provided.

Better infrastructure, like using google benchmark or deeper integration
with CI, is not planned at the moment.
diff --git a/.github/workflows/sycl_post_commit.yml b/.github/workflows/sycl_post_commit.yml
@@ -46,16 +46,33 @@ jobs:
           - name: Intel Arc A-Series Graphics with Level Zero
             runner: '["Linux", "arc"]'
             extra_lit_opts: --param matrix-xmx8=True --param gpu-intel-dg2=True
+          # Performance tests below. Specifics:
+          #  - only run performance tests (use LIT_FILTER env)
+          #  - ask llvm-lit to show all the output, even for PASS (-a)
+          #  - run in single thread (-j 1)
+          #  - enable the tests in LIT (--param enable-perf-tests=True)
+          #  - run on all available devices.
+          - name: Perf tests on Intel GEN12 Graphics system
+            runner: '["Linux", "gen12"]'
+            env: '{"LIT_FILTER":"PerformanceTests/"}'
+            extra_lit_opts: -a -j 1 --param enable-perf-tests=True
+            target_devices: all
+          - name: Perf tests on Intel Arc A-Series Graphics system
+            runner: '["Linux", "arc"]'
+            env: '{"LIT_FILTER":"PerformanceTests/"}'
+            extra_lit_opts: -a -j 1 --param enable-perf-tests=True
+            target_devices: all
     uses: ./.github/workflows/sycl_linux_run_tests.yml
     with:
       name: ${{ matrix.name }}
       runner: ${{ matrix. runner }}
       image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest
       image_options: -u 1001 --device=/dev/dri --privileged --cap-add SYS_ADMIN
-      target_devices: ext_oneapi_level_zero:gpu
+      target_devices: ${{ matrix.target_devices || 'ext_oneapi_level_zero:gpu' }}
       reset_gpu: true
 
       extra_lit_opts: ${{ matrix.extra_lit_opts }}
+      env: ${{ matrix.env || '{}' }}
 
       ref: ${{ github.sha }}
       merge_ref: ''
diff --git a/sycl/test-e2e/PerformanceTests/Reduction/reduce_over_sub_group.cpp b/sycl/test-e2e/PerformanceTests/Reduction/reduce_over_sub_group.cpp
@@ -0,0 +1,76 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+#include <sycl/sycl.hpp>
+
+#include <sycl/ext/oneapi/experimental/user_defined_reductions.hpp>
+
+#include <iomanip>
+
+using namespace sycl;
+namespace sycl_exp = sycl::ext::oneapi::experimental;
+
+/// Times a user-defined reduction over a sub-group for element type T.
+///
+/// Prints the instantiated function signature followed by the wall-clock
+/// duration, in milliseconds, of four kernel submissions. A fifth (the very
+/// first) submission is executed but not reported. Every work-item performs
+/// 100000 reductions with a custom addition operator and writes its
+/// accumulated value into the buffer.
+template <typename T> void test() {
+  std::cout << std::setw(50) << __PRETTY_FUNCTION__ << ", time:";
+  constexpr int WG_SIZE = 32 + 16 + 8 + 4;
+  constexpr int GLOBAL_SIZE = WG_SIZE * 1;
+
+  queue q;
+
+  buffer<T, 1> b{GLOBAL_SIZE};
+
+  for (int i = 0; i < 5; ++i) {
+    auto start = std::chrono::high_resolution_clock::now();
+    q.submit([&](handler &cgh) {
+       accessor acc{b, cgh};
+       // One T-sized scratch slot per work-item of the work-group, in bytes.
+       size_t temp_memory_size = WG_SIZE * sizeof(T);
+       auto scratch = sycl::local_accessor<std::byte, 1>(temp_memory_size, cgh);
+
+       cgh.parallel_for(
+           nd_range<1>{range<1>{GLOBAL_SIZE}, range<1>{WG_SIZE}},
+           [=](nd_item<1> ndi) {
+             auto g = ndi.get_group();
+             auto sg = ndi.get_sub_group();
+             // sg's scratch space starts at sg leader's *group* linear id.
+             // The scratchpad is a byte array, so that element index has to
+             // be scaled by sizeof(T); otherwise the regions handed to
+             // neighbouring sub-groups overlap whenever sizeof(T) > 1.
+             const size_t sg_offset =
+                 sizeof(T) * group_broadcast(sg, g.get_local_linear_id());
+             auto sg_scratch =
+                 sycl::span(&scratch[sg_offset],
+                            sizeof(T) * sg.get_local_linear_range());
+             auto handle = sycl_exp::group_with_scratchpad(sg, sg_scratch);
+             T val{0};
+             auto binop = [](T x, T y) { return x + y; };
+             for (int j = 0; j < 100000; ++j)
+               val += sycl_exp::reduce_over_group(
+                   handle, static_cast<T>(j % 100), binop);
+             acc[ndi.get_global_linear_id()] = val;
+           });
+     }).wait();
+    if (i == 0)
+      continue; // skip first iteration's overheads.
+    auto end = std::chrono::high_resolution_clock::now();
+    auto time =
+        std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
+            .count();
+    std::cout << " " << std::setw(6) << time << "ms";
+  }
+  std::cout << std::endl;
+}
+
+// Entry point: runs the timing test once per element type, integers first,
+// then the floating-point types the device supports.
+int main() {
+  // Calls test<T>() for the type of each argument, in left-to-right order.
+  auto run_for = [](auto... samples) { (test<decltype(samples)>(), ...); };
+
+  run_for(uint8_t{}, uint16_t{}, uint32_t{}, uint64_t{});
+
+  run_for(int8_t{}, int16_t{}, int32_t{}, int64_t{});
+
+  // half and double are only tested when the device reports the aspect.
+  const device dev;
+  if (dev.has(aspect::fp16))
+    run_for(half{});
+  run_for(float{});
+  if (dev.has(aspect::fp64))
+    run_for(double{});
+
+  return 0;
+}
diff --git a/sycl/test-e2e/PerformanceTests/lit.local.cfg b/sycl/test-e2e/PerformanceTests/lit.local.cfg
@@ -0,0 +1 @@
+config.required_features += ['enable-perf-tests']
diff --git a/sycl/test-e2e/lit.cfg.py b/sycl/test-e2e/lit.cfg.py
@@ -183,6 +183,8 @@
     config.ur_l0_leaks_debug = lit_config.params.get("ur_l0_leaks_debug")
     lit_config.note("UR_L0_LEAKS_DEBUG: " + config.ur_l0_leaks_debug)
 
+if lit_config.params.get("enable-perf-tests", False):
+    config.available_features.add("enable-perf-tests")
 # Make sure that any dynamic checks below are done in the build directory and
 # not where the sources are located. This is important for the in-tree
 # configuration (as opposite to the standalone one).

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+config.required_features += ['enable-perf-tests']`