[SYCL][Graph] Support work_group_memory extension (#16229)

EwanC · web-flow · commit a05ed96f6b21 · 2024-12-05T12:09:17.000Z
Document support for using [sycl_ext_oneapi_work_group_memory](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_work_group_memory.asciidoc) inside a graph kernel node, with E2E tests for verification. E2E Tests for `work_group_memory` update in executable graphs is has been included in PR #16025 , as it requires a UR fix that is part of that PR.
diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc
@@ -1977,14 +1977,20 @@ Removing this restriction is something we may look at for future revisions of
 The command submission functions defined in
 link:../experimental/sycl_ext_oneapi_enqueue_functions.asciidoc[sycl_ext_oneapi_enqueue_functions]
 can be used adding nodes to a graph when creating a graph from queue recording.
-New methods are also defined that enable submitting an executable graph, 
+New methods are also defined that enable submitting an executable graph,
 e.g. directly to a queue without returning an event.
 
 ==== sycl_ext_oneapi_free_function_kernels
 
-`sycl_ext_oneapi_free_function_kernels`, defined in 
+`sycl_ext_oneapi_free_function_kernels`, defined in
 link:../proposed/sycl_ext_oneapi_free_function_kernels.asciidoc[sycl_ext_oneapi_free_function_kernels]
-can be used with SYCL Graphs. 
+can be used with SYCL Graphs.
+
+==== sycl_ext_oneapi_work_group_memory
+
+Using the `work_group_memory` object defined in
+link:../experimental/sycl_ext_oneapi_work_group_memory.asciidoc[sycl_ext_oneapi_work_group_memory]
+inside graph kernel nodes is supported.
 
 ==== sycl_ext_oneapi_work_group_scratch_memory
 
diff --git a/sycl/test-e2e/Graph/Explicit/work_group_memory.cpp b/sycl/test-e2e/Graph/Explicit/work_group_memory.cpp
@@ -0,0 +1,10 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+// Extra run to check for immediate-command-list in Level Zero
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/work_group_memory.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/work_group_memory_free_function.cpp b/sycl/test-e2e/Graph/Explicit/work_group_memory_free_function.cpp
@@ -0,0 +1,13 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+// Extra run to check for immediate-command-list in Level Zero
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+// XFAIL: cuda
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/work_group_memory_free_function.cpp"
diff --git a/sycl/test-e2e/Graph/Inputs/free_function_kernels.cpp b/sycl/test-e2e/Graph/Inputs/free_function_kernels.cpp
@@ -25,7 +25,7 @@ int main() {
   kernel_bundle Bundle = get_kernel_bundle<bundle_state::executable>(Ctxt);
   kernel_id Kernel_id = exp_ext::get_kernel_id<ff_0>();
   kernel Kernel = Bundle.get_kernel(Kernel_id);
-  auto KernelNode = Graph.add([&](handler &cgh) {
+  auto KernelNode = add_node(Graph, Queue, [&](handler &cgh) {
     cgh.set_arg(0, PtrA);
     cgh.single_task(Kernel);
   });
diff --git a/sycl/test-e2e/Graph/Inputs/work_group_memory.cpp b/sycl/test-e2e/Graph/Inputs/work_group_memory.cpp
@@ -0,0 +1,43 @@
+// Tests using sycl_ext_oneapi_work_group_memory in a graph node
+
+#include "../graph_common.hpp"
+#include <sycl/ext/oneapi/experimental/work_group_memory.hpp>
+
+int main() {
+  queue Queue;
+  exp_ext::command_graph Graph{Queue};
+
+  std::vector<int> HostData(Size);
+  std::iota(HostData.begin(), HostData.end(), 10);
+
+  int *Ptr = malloc_device<int>(Size, Queue);
+  Queue.copy(HostData.data(), Ptr, Size).wait();
+
+  const size_t LocalSize = 128;
+  auto node = add_node(Graph, Queue, [&](handler &CGH) {
+    exp_ext::work_group_memory<int[]> WGMem{LocalSize, CGH};
+
+    CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) {
+      WGMem[Item.get_local_linear_id()] = Item.get_global_linear_id() * 2;
+      Ptr[Item.get_global_linear_id()] += WGMem[Item.get_local_linear_id()];
+    });
+  });
+
+  auto GraphExec = Graph.finalize();
+
+  for (unsigned N = 0; N < Iterations; N++) {
+    Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
+  }
+  Queue.wait_and_throw();
+
+  Queue.copy(Ptr, HostData.data(), Size);
+  Queue.wait_and_throw();
+
+  for (size_t i = 0; i < Size; i++) {
+    int Ref = 10 + i + (Iterations * (i * 2));
+    assert(check_value(i, Ref, HostData[i], "Ptr"));
+  }
+
+  free(Ptr, Queue);
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/work_group_memory_free_function.cpp b/sycl/test-e2e/Graph/Inputs/work_group_memory_free_function.cpp
@@ -0,0 +1,64 @@
+// Tests using sycl_ext_oneapi_work_group_memory in a graph node with
+// free functions
+
+#include "../graph_common.hpp"
+#include <sycl/ext/oneapi/experimental/work_group_memory.hpp>
+#include <sycl/ext/oneapi/free_function_queries.hpp>
+
+SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(exp_ext::nd_range_kernel<1>)
+void ff_local_mem(int *Ptr, exp_ext::work_group_memory<int[]> LocalMem) {
+  const auto WI = sycl::ext::oneapi::this_work_item::get_nd_item<1>();
+  size_t LocalID = WI.get_local_id();
+  size_t GlobalID = WI.get_global_id();
+
+  LocalMem[LocalID] = GlobalID * 2;
+  Ptr[GlobalID] += LocalMem[LocalID];
+}
+
+int main() {
+  queue Queue;
+  exp_ext::command_graph Graph{Queue};
+
+  std::vector<int> HostData(Size);
+  std::iota(HostData.begin(), HostData.end(), 10);
+
+  int *Ptr = malloc_device<int>(Size, Queue);
+  Queue.copy(HostData.data(), Ptr, Size).wait();
+
+  const size_t LocalSize = 128;
+
+#ifndef __SYCL_DEVICE_ONLY__
+  kernel_bundle Bundle =
+      get_kernel_bundle<bundle_state::executable>(Queue.get_context());
+  kernel_id Kernel_id = exp_ext::get_kernel_id<ff_local_mem>();
+  kernel Kernel = Bundle.get_kernel(Kernel_id);
+
+  auto node = add_node(Graph, Queue, [&](handler &CGH) {
+    CGH.set_arg(0, Ptr);
+
+    exp_ext::work_group_memory<int[]> WGMem{LocalSize, CGH};
+    CGH.set_arg(1, WGMem);
+
+    nd_range NDRange{{Size}, {LocalSize}};
+    CGH.parallel_for(NDRange, Kernel);
+  });
+
+  auto GraphExec = Graph.finalize();
+
+  for (unsigned N = 0; N < Iterations; N++) {
+    Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
+  }
+  Queue.wait_and_throw();
+
+  Queue.copy(Ptr, HostData.data(), Size);
+  Queue.wait_and_throw();
+
+  for (size_t i = 0; i < Size; i++) {
+    int Ref = 10 + i + (Iterations * (i * 2));
+    assert(check_value(i, Ref, HostData[i], "Ptr"));
+  }
+#endif
+
+  free(Ptr, Queue);
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/RecordReplay/work_group_memory.cpp b/sycl/test-e2e/Graph/RecordReplay/work_group_memory.cpp
@@ -0,0 +1,10 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+// Extra run to check for immediate-command-list in Level Zero
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/work_group_memory.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/work_group_memory_free_function.cpp b/sycl/test-e2e/Graph/RecordReplay/work_group_memory_free_function.cpp
@@ -0,0 +1,13 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+// Extra run to check for immediate-command-list in Level Zero
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+// XFAIL: cuda
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/work_group_memory_free_function.cpp"