[ESIMD] E2E test for slm_allocator API. (intel#1449)

kbobrovs · web-flow · commit adbf7abea074 · 2023-01-01T16:36:42.000-06:00
* [ESIMD] E2E test for slm_allocator API.

Signed-off-by: Konstantin S Bobrovsky &lt;konstantin.s.bobrovsky@intel.com&gt;
diff --git a/SYCL/ESIMD/slm_alloc.cpp b/SYCL/ESIMD/slm_alloc.cpp
@@ -0,0 +1,186 @@
+// REQUIRES: gpu
+// UNSUPPORTED: cuda || hip
+//
+// RUN: %clangxx -fsycl %s -o %t.1.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.1.out
+//
+// Vary the test case by forcing inlining of the functions with slm_allocator:
+// RUN: %clangxx -fsycl -DFORCE_INLINE %s -o %t.2.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.2.out
+
+// This is end-to-end test for the slm_allocator API used together with the
+// slm_init. The call graph is:
+//         Test1(kernel) - uses slm_init(SLM_IN_KERNEL)
+//         /    \
+//        /      v
+//       /       bar - uses slm_allocator(SLM_IN_BAR)
+//      v
+//      foo - uses slm_allocator(SLM_IN_FOO)
+// Test1 kernel SLM usage is SLM_IN_KERNEL + max(SLM_IN_BAR, SLM_IN_FOO).
+// SLM offset returned by the slm_allocator in foo and bar is the same and is
+// SLM_IN_KERNEL bytes.
+// Bar uses slightly bigger SLM frame than foo. It modifies values (adds 10) in
+// SLM resulting from foo, plus appends couple more '100's.
+
+#include <iostream>
+#include <sycl/ext/intel/esimd.hpp>
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+using namespace sycl::ext::intel::esimd;
+using namespace sycl::ext::intel::experimental::esimd;
+
+using T = int;
+constexpr int LOCAL_SIZE = 4;
+constexpr int GLOBAL_SIZE = 8;
+
+constexpr int NUM_WGS = GLOBAL_SIZE / LOCAL_SIZE;
+
+constexpr int ELEM_SIZE = sizeof(T);
+
+constexpr int SLM_IN_KERNEL = LOCAL_SIZE * ELEM_SIZE;
+constexpr int SLM_IN_FOO = LOCAL_SIZE * ELEM_SIZE;
+constexpr int BAR_EXTRA_ELEMS = 2;
+constexpr int SLM_IN_BAR = SLM_IN_FOO + BAR_EXTRA_ELEMS * ELEM_SIZE;
+constexpr int SLM_TOTAL = SLM_IN_KERNEL + std::max(SLM_IN_FOO, SLM_IN_BAR);
+constexpr int BAR_MARKER1 = 10;
+constexpr int BAR_MARKER2 = 100;
+
+#ifdef FORCE_INLINE
+constexpr bool force_inline = true;
+inline
+    __attribute__((always_inline))
+#else
+constexpr bool force_inline = false;
+__attribute__((noinline))
+#endif // FORCE_INLINE
+    void
+    foo(int local_id) {
+  slm_allocator<SLM_IN_FOO> a;
+  uint32_t slm_off = a.get_offset();
+  // write data chunk "Y":
+  slm_scalar_store(slm_off + local_id * ELEM_SIZE, (T)local_id);
+}
+
+#ifdef FORCE_INLINE
+inline
+    __attribute__((always_inline))
+#else
+__attribute__((noinline))
+#endif // FORCE_INLINE
+    void
+    bar(int local_id) {
+  slm_allocator<SLM_IN_BAR> a;
+  uint32_t slm_off = a.get_offset();
+  uint32_t off = slm_off + local_id * ELEM_SIZE;
+  T v = slm_scalar_load<T>(off);
+  // update data chunk "Y":
+  slm_scalar_store(off, v + BAR_MARKER1);
+
+  if (local_id == 0) {
+    for (int i = 0; i < BAR_EXTRA_ELEMS; i++) {
+      // write data chunk "Z":
+      slm_scalar_store((2 * LOCAL_SIZE + i) * ELEM_SIZE, (T)BAR_MARKER2);
+    }
+  }
+}
+
+int main(void) {
+  queue q;
+  auto dev = q.get_device();
+  std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
+  std::cout << "force_inline=" << force_inline << "\n";
+  auto ctxt = q.get_context();
+  uint32_t size = SLM_TOTAL * NUM_WGS / ELEM_SIZE;
+
+  T *arr = malloc_shared<T>(size, dev, ctxt);
+
+  auto e = q.submit([&](handler &cgh) {
+    cgh.parallel_for<class Test1>(
+        nd_range<1>(GLOBAL_SIZE, LOCAL_SIZE),
+        [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
+          slm_init(SLM_IN_KERNEL);
+          int local_id = ndi.get_local_linear_id();
+          int group_id = ndi.get_group_linear_id();
+
+          // write data chunk "X":
+          slm_scalar_store(local_id * ELEM_SIZE, local_id);
+          barrier();
+
+          foo(local_id);
+          barrier();
+
+          bar(local_id);
+          barrier();
+
+          // copy data from SLM to the output for further verification
+          if (local_id == 0) {
+            uint32_t group_off = SLM_TOTAL * group_id;
+
+            for (int i = 0; i < SLM_TOTAL / ELEM_SIZE; i++) {
+              uint32_t slm_off = i * ELEM_SIZE;
+              uint32_t mem_off = group_off + slm_off;
+              scatter(arr, simd<uint32_t, 1>(mem_off),
+                      simd<T, 1>(slm_scalar_load<T>(slm_off)));
+            }
+          }
+        });
+  });
+  e.wait();
+
+  for (int i = 0; i < NUM_WGS * SLM_TOTAL / ELEM_SIZE; i++) {
+    std::cout << " " << arr[i];
+    if ((i + 1) % 10 == 0) {
+      std::cout << "\n";
+    }
+  }
+  std::cout << "\n";
+  int err_cnt = 0;
+
+  for (int g = 0; g < NUM_WGS; g++) {
+    uint32_t group_off = SLM_TOTAL * g / ELEM_SIZE;
+    for (int i = 0; i < LOCAL_SIZE; i++) {
+      int ind = group_off + i;
+
+      // check data copied from kernel's SLM frame ("X")
+      auto test = arr[ind];
+      auto gold = i;
+
+      if (test != gold) {
+        if (++err_cnt < 10) {
+          std::cerr << "*** ERROR (X) at " << ind << ": " << test
+                    << " != " << gold << " (gold)\n";
+        }
+      }
+      // check data copied from the overlapping part of foo's and bar's SLM
+      // frames - "Y"
+      ind = ind + LOCAL_SIZE; // shift to the foo/bar SLM frame
+      test = arr[ind];
+      gold = i + BAR_MARKER1;
+
+      if (test != gold) {
+        if (++err_cnt < 10) {
+          std::cerr << "*** ERROR (Y) at " << ind << ": " << test
+                    << " != " << gold << " (gold)\n";
+        }
+      }
+    }
+    // now check data written by bar past the overlapping part of foo/bar SLM
+    // frame - "Z"
+    for (int i = 0; i < BAR_EXTRA_ELEMS; i++) {
+      int ind =
+          group_off + 2 /*kernel's and foo's SLM frames*/ * LOCAL_SIZE + i;
+      auto test = arr[ind];
+      auto gold = BAR_MARKER2;
+
+      if (test != gold) {
+        if (++err_cnt < 10) {
+          std::cerr << "*** ERROR (Z) at " << ind << ": " << test
+                    << " != " << gold << " (gold)\n";
+        }
+      }
+    }
+  }
+  std::cout << (err_cnt ? "FAILED\n" : "Passed\n");
+  return err_cnt ? 1 : 0;
+}
diff --git a/SYCL/ESIMD/slm_alloc_many_kernels_many_funcs.cpp b/SYCL/ESIMD/slm_alloc_many_kernels_many_funcs.cpp
@@ -0,0 +1,204 @@
+// REQUIRES: gpu
+// UNSUPPORTED: cuda || hip
+//
+// RUN: %clangxx -fsycl %s -o %t.1.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.1.out
+//
+// Vary the test case by forcing inlining of the functions with slm_allocator:
+// RUN: %clangxx -fsycl -DFORCE_INLINE %s -o %t.2.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.2.out
+
+// Checks validity of SLM frame offsets in case of complex call graph with two
+// kernels and 2 functions all using SLM, and one of the functions using two
+// slm_allocator objects with nested liveness ranges.
+
+// Hierarchy of SLM frames:
+//   N1      N2
+//   | \     |
+//   |  \    |
+//   v   \   |
+//   X    \  |
+//   | \   \ |
+//   |  \   \|
+//   v   \---v
+//   Y       Z
+//
+// SLM offsets are expected to be:
+// 1) no inlining case:
+// --- Kernel0
+// X  - N1
+// Y  - N1 + X
+// Z  - max(N1 + X, N2)
+// --- Kernel2
+// X  - 0 (not reachable, offset not updated in the result)
+// Y  - 0 (not reachable, offset not updated in the result)
+// Z  - max(N1 + X, N2)
+// 2) forced inlining case:
+// --- Kernel0
+// X  - N1
+// Y  - N1 + X
+// Z  - N1 // this is because Z (bar) is inlined into X (foo) and into
+//         // N1 (kernel1), and execution of the second inlined scope
+//         // allocation and offset recording into the result happens last.
+// --- Kernel2
+// X  - 0 (not reachable, offset not updated in the result)
+// Y  - 0 (not reachable, offset not updated in the result)
+// Z  - N2
+// Note the difference in SLM offset for Z in the inlining/no-inlining cases for
+// Z scope. This is because inlining effectively splits Z scope when inlining
+// into kernel0 and kernel2
+
+#include <sycl/ext/intel/esimd.hpp>
+#include <sycl/sycl.hpp>
+
+#include <cstring>
+#include <iostream>
+
+using namespace sycl;
+using namespace sycl::ext::intel::esimd;
+using namespace sycl::ext::intel::experimental::esimd;
+
+using T = uint32_t;
+
+constexpr int SLM_N1 = 7;
+constexpr int SLM_N2 = 1;
+constexpr int SLM_X = 8;
+constexpr int SLM_Y = 16;
+constexpr int SLM_Z = 4;
+
+constexpr int LOCAL_SIZE = 2;
+constexpr int GLOBAL_SIZE = 2;
+
+template <class T> void scalar_store(T *base, uint32_t off, T val) {
+  scatter<T, 1>(base, simd<uint32_t, 1>(off * sizeof(T)), val);
+}
+
+// Result array format
+// |---- kernel0 ----|  |---- kernel2 ----|
+// x_off, y_off, z_off, x_off, y_off, z_off
+
+// Offsets in the result sub-array, to store each checked SLM frame offset at.
+enum { x_off_ind, y_off_ind, z_off_ind, num_offs };
+
+// Offsets of sub-arrays
+enum { kernel0_base = x_off_ind, kernel1_base = num_offs };
+
+#define STORE_SLM_OFF(ID, off)                                                 \
+  if (local_id == 0) {                                                         \
+    scalar_store(out, base + ID##_off_ind, off);                               \
+  }
+
+#ifdef FORCE_INLINE
+constexpr bool force_inline = true;
+#define INLINE_CTL inline __attribute__((always_inline))
+#else
+constexpr bool force_inline = false;
+#define INLINE_CTL __attribute__((noinline))
+#endif // FORCE_INLINE
+
+INLINE_CTL void bar(int local_id, T *out, unsigned base) {
+  slm_allocator<SLM_Z> a;
+  unsigned z_off = a.get_offset();
+  STORE_SLM_OFF(z, z_off);
+}
+
+INLINE_CTL void foo(int local_id, T *out, unsigned base) {
+  slm_allocator<SLM_X> a;
+  unsigned x_off = a.get_offset();
+  STORE_SLM_OFF(x, x_off);
+  bar(local_id, out, base);
+  {
+    slm_allocator<SLM_Y> b;
+    unsigned y_off = b.get_offset();
+    STORE_SLM_OFF(y, y_off);
+  }
+}
+
+int main(void) {
+  queue q;
+  auto dev = q.get_device();
+  std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
+  std::cout << "force_inline=" << force_inline << "\n";
+  auto ctxt = q.get_context();
+
+  constexpr int num_kernels = 2;
+  T *arr = malloc_shared<T>(num_kernels * num_offs, dev, ctxt);
+  std::memset(arr, 0, num_kernels * num_offs * sizeof(T));
+
+  auto e = q.submit([&](handler &cgh) {
+    cgh.parallel_for<class Kernel0>(nd_range<1>(GLOBAL_SIZE, LOCAL_SIZE),
+                                    [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
+                                      slm_init(SLM_N1);
+                                      int local_id = ndi.get_local_linear_id();
+                                      foo(local_id, arr, kernel0_base);
+                                      bar(local_id, arr, kernel0_base);
+                                    });
+  });
+  e.wait();
+
+  e = q.submit([&](handler &cgh) {
+    cgh.parallel_for<class Kernel2>(nd_range<1>(GLOBAL_SIZE, LOCAL_SIZE),
+                                    [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
+                                      slm_init(SLM_N2);
+                                      int local_id = ndi.get_local_linear_id();
+                                      bar(local_id, arr, kernel1_base);
+                                    });
+  });
+  e.wait();
+
+  T gold_arr[num_kernels * num_offs];
+  T *gold_arr0 = &gold_arr[kernel0_base];
+  T *gold_arr1 = &gold_arr[kernel1_base];
+
+  // For kernel0 inline/no-inline results are the same for X and Y:
+  // X  - N1
+  // Y  - N1 + X
+  // Z  - max(N1 + X, N2)
+  gold_arr0[x_off_ind] = SLM_N1;
+  gold_arr0[y_off_ind] = SLM_N1 + SLM_X;
+
+  // For kernel0 inline/no-inline case splits for Z:
+#ifdef FORCE_INLINE
+  gold_arr0[z_off_ind] = SLM_N1;
+#else
+  gold_arr0[z_off_ind] = std::max(SLM_N1 + SLM_X, SLM_N2);
+#endif // FORCE_INLINE
+
+  // For kernel1 inline/no-inline results are the same for X and Y:
+  // X  - 0
+  // Y  - 0
+  gold_arr1[x_off_ind] = 0;
+  gold_arr1[y_off_ind] = 0;
+
+  // For kernel1 inline/no-inline case splits for Z:
+#ifdef FORCE_INLINE
+  gold_arr1[z_off_ind] = SLM_N2;
+#else
+  gold_arr1[z_off_ind] = std::max(SLM_N1 + SLM_X, SLM_N2);
+#endif // FORCE_INLINE
+
+  T *test_arr = arr;
+  int err_cnt = 0;
+
+  T kernel_bases[num_kernels] = {kernel0_base, kernel1_base};
+
+  for (int k = 0; k < num_kernels; k++) {
+    std::cout << "Kernel " << k << "\n";
+
+    for (int i = 0; i < num_offs; i++) {
+      T test = test_arr[kernel_bases[k] + i];
+      T gold = gold_arr[kernel_bases[k] + i];
+
+      if (test != gold) {
+        ++err_cnt;
+        std::cerr << "  *** ERROR at [" << i << "]: " << test << " != " << gold
+                  << "(gold)\n";
+      } else {
+        std::cout << "  [" << i << "]: " << test << " == " << gold
+                  << "(gold)\n";
+      }
+    }
+  }
+  std::cout << (err_cnt ? "FAILED\n" : "Passed\n");
+  return err_cnt ? 1 : 0;
+}
diff --git a/SYCL/ESIMD/slm_alloc_many_kernels_one_func.cpp b/SYCL/ESIMD/slm_alloc_many_kernels_one_func.cpp