[SYCL][Graph] Support native CUDA async alloc/free nodes (#19091)

EwanC · web-flow · commit 0df8a4566a40 · 2025-06-26T17:30:20.000+01:00
Adds support for using CUDA-Graph async alloc/free nodes in a `sycl_ext_codeplay_enqueue_native_command` native-command object in a graph. This requires CUDA 12.9 or later, where child graphs with async alloc/free nodes can be added to a parent graph: https://docs.nvidia.com/cuda/cuda-c-programming-guide/#memory-nodes-in-child-graphs
diff --git a/sycl/doc/design/CommandGraph.md b/sycl/doc/design/CommandGraph.md
@@ -609,6 +609,22 @@ The `urCommandBufferAppendUSMPrefetchExp` and
 adapter as empty nodes enforcing the node dependencies. As such the
 optimization hints are a no-op.
 
+#### Native Command
+
+CUDA child graphs are used to implement the `urCommandBufferAppendNativeCommandExp`
+entry-point for `sycl_ext_codeplay_enqueue_native_command` SYCL-Graph support.
+The SYCL native-command node exposes a CUDA-Graph object to the user, which is
+then added as a child graph of the parent graph from the SYCL-graph. Therefore
+any CUDA limitations that apply to the usage of child graph nodes in a graph
+also apply to native-command nodes.
+
+Using CUDA asynchronous allocation/free nodes in child graphs is only supported
+[from CUDA 12.9](https://docs.nvidia.com/cuda/cuda-c-programming-guide/#memory-nodes-in-child-graphs).
+As a result, adding these async alloc & free nodes to the CUDA-Graph handle
+given to a user inside a native-command is only supported in DPC++ builds
+against CUDA 12.9 or later, where the SYCL-RT can take advantage of this CUDA
+functionality in the backend.
+
 ### HIP
 
 The HIP backend offers a graph management API very similar to CUDA Graph
@@ -639,6 +655,20 @@ The `urCommandBufferAppendUSMPrefetchExp` and
 adapter as empty nodes enforcing the node dependencies. As such the
 optimization hints are a no-op.
 
+#### Native Command
+
+HIP child graphs are used to implement the `urCommandBufferAppendNativeCommandExp`
+entry-point for `sycl_ext_codeplay_enqueue_native_command` SYCL-Graph support.
+The SYCL native-command node exposes a HIP-Graph object to the user, which is
+then added as a child graph of the parent graph from the SYCL-graph. Therefore
+any HIP limitations that apply to the usage of child graph nodes in a graph
+also apply to native-command nodes.
+
+Using HIP-Graph asynchronous allocation/free nodes in child graphs is not
+supported, and as a result adding async alloc & free nodes to the native
+HIP-Graph handle exposed to the user in a native-command will result in an
+exception when the graph is finalized.
+
 ### OpenCL
 
 SYCL-Graph is only enabled for an OpenCL backend when the
@@ -679,6 +709,7 @@ adapter where there is matching support for each function in the list.
 | urCommandBufferAppendMemBufferFillExp | clCommandFillBufferKHR | Yes |
 | urCommandBufferAppendUSMPrefetchExp |  | No |
 | urCommandBufferAppendUSMAdviseExp |  | No |
+| urCommandBufferAppendNativeCommandExp |  | Yes |
 | urEnqueueCommandBufferExp | clEnqueueCommandBufferKHR | Yes |
 |  | clCommandBarrierWithWaitListKHR | No |
 |  | clCommandCopyImageKHR | No |
diff --git a/sycl/test-e2e/Graph/NativeCommand/cuda_record_async_malloc.cpp b/sycl/test-e2e/Graph/NativeCommand/cuda_record_async_malloc.cpp
@@ -0,0 +1,88 @@
+// RUN: %{build} -o %t.out %cuda_options
+// RUN: %{run} %t.out
+// RUN: %if preview-breaking-changes-supported %{ %{build} -fpreview-breaking-changes -o %t2.out %cuda_options %}
+// RUN: %if preview-breaking-changes-supported %{ %{run} %t2.out %}
+// REQUIRES: target-nvidia, cuda_dev_kit
+
+#include <cuda.h>
+#include <sycl/backend.hpp>
+#include <sycl/ext/oneapi/experimental/graph.hpp>
+#include <sycl/interop_handle.hpp>
+#include <sycl/usm.hpp>
+
+namespace exp_ext = sycl::ext::oneapi::experimental;
+using namespace sycl;
+
+int main() {
+  // Test is only expected to pass from CUDA 12.9 onwards, skip otherwise
+  // See SYCL-Graph design document on CUDA native-command support
+  int CudaDriverVersion = 0;
+  cuDriverGetVersion(&CudaDriverVersion);
+  if (CudaDriverVersion < 12090) {
+    return 0;
+  }
+
+  queue Queue;
+
+  const size_t Size = 128;
+  int32_t *PtrX = malloc_device<int32_t>(Size, Queue);
+
+  exp_ext::command_graph Graph{Queue};
+
+  Graph.begin_recording(Queue);
+
+  const int32_t Pattern = 42;
+  Queue.submit([&](handler &CGH) {
+    CGH.ext_codeplay_enqueue_native_command([=](interop_handle IH) {
+      if (!IH.ext_codeplay_has_graph()) {
+        assert(false && "Native Handle should have a graph");
+      }
+      // Newly created stream for this node
+      auto NativeStream = IH.get_native_queue<backend::ext_oneapi_cuda>();
+      // Graph already created with cuGraphCreate
+      CUgraph NativeGraph =
+          IH.ext_codeplay_get_native_graph<backend::ext_oneapi_cuda>();
+
+      // Start stream capture, recording into the native graph
+      auto Res = cuStreamBeginCaptureToGraph(NativeStream, NativeGraph, nullptr,
+                                             nullptr, 0,
+                                             CU_STREAM_CAPTURE_MODE_GLOBAL);
+      assert(Res == CUDA_SUCCESS);
+
+      // Add asynchronous malloc node
+      CUdeviceptr PtrAsync;
+      Res = cuMemAllocAsync(&PtrAsync, Size * sizeof(int32_t), NativeStream);
+      assert(Res == CUDA_SUCCESS);
+
+      // Fill async allocation with the 32-bit Pattern value
+      Res = cuMemsetD32Async(PtrAsync, Pattern, Size, NativeStream);
+      assert(Res == CUDA_SUCCESS);
+
+      // Add memcopy node from the async allocation to the USM allocation
+      Res = cuMemcpyAsync((CUdeviceptr)PtrX, PtrAsync, Size * sizeof(int32_t),
+                          NativeStream);
+      assert(Res == CUDA_SUCCESS);
+
+      Res = cuMemFreeAsync(PtrAsync, NativeStream);
+      assert(Res == CUDA_SUCCESS);
+
+      Res = cuStreamEndCapture(NativeStream, &NativeGraph);
+      assert(Res == CUDA_SUCCESS);
+    });
+  });
+
+  Graph.end_recording();
+
+  auto ExecGraph = Graph.finalize();
+  Queue.ext_oneapi_graph(ExecGraph).wait();
+
+  std::vector<int32_t> HostData(Size);
+  Queue.copy(PtrX, HostData.data(), Size).wait();
+  for (size_t i = 0; i < Size; i++) {
+    assert(Pattern == HostData[i]);
+  }
+
+  free(PtrX, Queue);
+
+  return 0;
+}
diff --git a/unified-runtime/source/adapters/cuda/command_buffer.cpp b/unified-runtime/source/adapters/cuda/command_buffer.cpp
@@ -23,7 +23,9 @@ namespace {
 ur_result_t
 commandBufferDestroy(ur_exp_command_buffer_handle_t CommandBuffer) try {
   // Release the memory allocated to the CudaGraph
-  UR_CHECK_ERROR(cuGraphDestroy(CommandBuffer->CudaGraph));
+  if (CommandBuffer->CudaGraph) {
+    UR_CHECK_ERROR(cuGraphDestroy(CommandBuffer->CudaGraph));
+  }
 
   // Release the memory allocated to the CudaGraphExec
   if (CommandBuffer->CudaGraphExec) {
@@ -1515,9 +1517,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendNativeCommandExp(
 
     // Add user defined node to graph as a subgraph
     CUgraphNode GraphNode;
+#if CUDA_VERSION >= 12090
+    // CUDA 12.9 required to enable native commands to contain memory nodes
+    // https://docs.nvidia.com/cuda/cuda-c-programming-guide/#memory-nodes-in-child-graphs
+    CUgraphNodeParams ChildNodeParams{};
+    ChildNodeParams.type = CU_GRAPH_NODE_TYPE_GRAPH;
+    ChildNodeParams.graph.graph = ChildGraph;
+    ChildNodeParams.graph.ownership = CU_GRAPH_CHILD_GRAPH_OWNERSHIP_MOVE;
+    UR_CHECK_ERROR(cuGraphAddNode_v2(&GraphNode, hCommandBuffer->CudaGraph,
+                                     DepsList.data(), NULL /* edge data */,
+                                     DepsList.size(), &ChildNodeParams));
+    // The handle to the child graph is now owned by the parent and will be
+    // destroyed when the parent is destroyed. However, the SYCL-RT will still
+    // call `urCommandBufferReleaseExp` on the child command-buffer, so set
+    // the handle to nullptr to avoid destroying the underlying graph there.
+    hChildCommandBuffer->CudaGraph = nullptr;
+#else
     UR_CHECK_ERROR(
         cuGraphAddChildGraphNode(&GraphNode, hCommandBuffer->CudaGraph,
                                  DepsList.data(), DepsList.size(), ChildGraph));
+#endif
+
     auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode);
     if (pSyncPoint) {
       *pSyncPoint = SyncPoint;