Skip to content

Commit 0df8a45

Browse files
authored
[SYCL][Graph] Support native CUDA async alloc/free nodes (#19091)
Adds support for using CUDA-Graph async alloc/free nodes in a `sycl_ext_codeplay_native_command` native-command object in a graph. This requires on CUDA 12.9 where child graphs with async alloc/free nodes can be added to a parent graph https://docs.nvidia.com/cuda/cuda-c-programming-guide/#memory-nodes-in-child-graphs
1 parent 6bae737 commit 0df8a45

File tree

3 files changed

+140
-1
lines changed

3 files changed

+140
-1
lines changed

sycl/doc/design/CommandGraph.md

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -609,6 +609,22 @@ The `urCommandBufferAppendUSMPrefetchExp` and
609609
adapter as empty nodes enforcing the node dependencies. As such the
610610
optimization hints are a no-op.
611611

612+
#### Native Command
613+
614+
CUDA child graphs are used to implement the `urCommandBufferAppendNativeCommandExp`
615+
entry-point for `sycl_ext_codeplay_enqueue_native_command` SYCL-Graph support.
616+
The SYCL native-command node exposes a CUDA-Graph object to the user, which is
617+
then added as a child graph of the parent graph from the SYCL-graph. Therefore
618+
any CUDA limitations that apply to the usage of child nodes in a graph, apply
619+
to native-command nodes.
620+
621+
Using CUDA asynchronous allocation/free nodes in child graphs is only supported
622+
[from CUDA 12.9](https://docs.nvidia.com/cuda/cuda-c-programming-guide/#memory-nodes-in-child-graphs).
623+
As a result adding these async alloc & free nodes to the CUDA-Graph handle
624+
given to a user inside a native-command is only supported in DPC++ builds
625+
against CUDA 12.9 and later when the SYCL-RT can take advantage of this CUDA
626+
functionality in the backend.
627+
612628
### HIP
613629

614630
The HIP backend offers a graph management API very similar to CUDA Graph
@@ -639,6 +655,20 @@ The `urCommandBufferAppendUSMPrefetchExp` and
639655
adapter as empty nodes enforcing the node dependencies. As such the
640656
optimization hints are a no-op.
641657

658+
#### Native Command
659+
660+
HIP child graphs are used to implement the `urCommandBufferAppendNativeCommandExp`
661+
entry-point for `sycl_ext_codeplay_enqueue_native_command` SYCL-Graph support.
662+
The SYCL native-command node exposes a HIP-Graph object to the user, which is
663+
then added as a child graph of the parent graph from the SYCL-graph. Therefore
664+
any CUDA limitations that apply to the usage of child nodes in a graph, apply
665+
to native-command nodes.
666+
667+
Using HIP-Graph asynchronous allocation/free nodes in child graphs is not
668+
supported, and as a result adding async alloc & free nodes to the native
669+
HIP-Graph handle exposed to the user in a native-command will result in an
670+
exception when the graph is finalized.
671+
642672
### OpenCL
643673

644674
SYCL-Graph is only enabled for an OpenCL backend when the
@@ -679,6 +709,7 @@ adapter where there is matching support for each function in the list.
679709
| urCommandBufferAppendMemBufferFillExp | clCommandFillBufferKHR | Yes |
680710
| urCommandBufferAppendUSMPrefetchExp | | No |
681711
| urCommandBufferAppendUSMAdviseExp | | No |
712+
| urCommandBufferAppendNativeCommandExp| | Yes |
682713
| urEnqueueCommandBufferExp | clEnqueueCommandBufferKHR | Yes |
683714
| | clCommandBarrierWithWaitListKHR | No |
684715
| | clCommandCopyImageKHR | No |
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
// RUN: %{build} -o %t.out %cuda_options
2+
// RUN: %{run} %t.out
3+
// RUN: %if preview-breaking-changes-supported %{ %{build} -fpreview-breaking-changes -o %t2.out %cuda_options %}
4+
// RUN: %if preview-breaking-changes-supported %{ %{run} %t2.out %}
5+
// REQUIRES: target-nvidia, cuda_dev_kit
6+
7+
#include <cuda.h>
8+
#include <sycl/backend.hpp>
9+
#include <sycl/ext/oneapi/experimental/graph.hpp>
10+
#include <sycl/interop_handle.hpp>
11+
#include <sycl/usm.hpp>
12+
13+
namespace exp_ext = sycl::ext::oneapi::experimental;
14+
using namespace sycl;
15+
16+
int main() {
17+
// Test is only expected to pass after CUDA 12.9
18+
// See SYCL-Graph design document on CUDA native-command support
19+
int CudaDriverVersion = 0;
20+
cuDriverGetVersion(&CudaDriverVersion);
21+
if (CudaDriverVersion < 12090) {
22+
return 0;
23+
}
24+
25+
queue Queue;
26+
27+
const size_t Size = 128;
28+
int32_t *PtrX = malloc_device<int32_t>(Size, Queue);
29+
30+
exp_ext::command_graph Graph{Queue};
31+
32+
Graph.begin_recording(Queue);
33+
34+
const int32_t Pattern = 42;
35+
Queue.submit([&](handler &CGH) {
36+
CGH.ext_codeplay_enqueue_native_command([=](interop_handle IH) {
37+
if (!IH.ext_codeplay_has_graph()) {
38+
assert(false && "Native Handle should have a graph");
39+
}
40+
// Newly created stream for this node
41+
auto NativeStream = IH.get_native_queue<backend::ext_oneapi_cuda>();
42+
// Graph already created with cuGraphCreate
43+
CUgraph NativeGraph =
44+
IH.ext_codeplay_get_native_graph<backend::ext_oneapi_cuda>();
45+
46+
// Start stream capture
47+
auto Res = cuStreamBeginCaptureToGraph(NativeStream, NativeGraph, nullptr,
48+
nullptr, 0,
49+
CU_STREAM_CAPTURE_MODE_GLOBAL);
50+
assert(Res == CUDA_SUCCESS);
51+
52+
// Add asynchronous malloc node
53+
CUdeviceptr PtrAsync;
54+
Res = cuMemAllocAsync(&PtrAsync, Size * sizeof(int32_t), NativeStream);
55+
assert(Res == CUDA_SUCCESS);
56+
57+
// Fill async allocation
58+
Res = cuMemsetD32Async(PtrAsync, Pattern, Size, NativeStream);
59+
assert(Res == CUDA_SUCCESS);
60+
61+
// Add memcopy node to USM allocation
62+
Res = cuMemcpyAsync((CUdeviceptr)PtrX, PtrAsync, Size * sizeof(int32_t),
63+
NativeStream);
64+
assert(Res == CUDA_SUCCESS);
65+
66+
Res = cuMemFreeAsync(PtrAsync, NativeStream);
67+
assert(Res == CUDA_SUCCESS);
68+
69+
Res = cuStreamEndCapture(NativeStream, &NativeGraph);
70+
assert(Res == CUDA_SUCCESS);
71+
});
72+
});
73+
74+
Graph.end_recording();
75+
76+
auto ExecGraph = Graph.finalize();
77+
Queue.ext_oneapi_graph(ExecGraph).wait();
78+
79+
std::vector<int32_t> HostData(Size);
80+
Queue.copy(PtrX, HostData.data(), Size).wait();
81+
for (size_t i = 0; i < Size; i++) {
82+
assert(Pattern == HostData[i]);
83+
}
84+
85+
free(PtrX, Queue);
86+
87+
return 0;
88+
}

unified-runtime/source/adapters/cuda/command_buffer.cpp

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,9 @@ namespace {
2323
ur_result_t
2424
commandBufferDestroy(ur_exp_command_buffer_handle_t CommandBuffer) try {
2525
// Release the memory allocated to the CudaGraph
26-
UR_CHECK_ERROR(cuGraphDestroy(CommandBuffer->CudaGraph));
26+
if (CommandBuffer->CudaGraph) {
27+
UR_CHECK_ERROR(cuGraphDestroy(CommandBuffer->CudaGraph));
28+
}
2729

2830
// Release the memory allocated to the CudaGraphExec
2931
if (CommandBuffer->CudaGraphExec) {
@@ -1515,9 +1517,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendNativeCommandExp(
15151517

15161518
// Add user defined node to graph as a subgraph
15171519
CUgraphNode GraphNode;
1520+
#if CUDA_VERSION >= 12090
1521+
// CUDA 12.9 required to enable native commands to contain memory nodes
1522+
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/#memory-nodes-in-child-graphs
1523+
CUgraphNodeParams ChildNodeParams{};
1524+
ChildNodeParams.type = CU_GRAPH_NODE_TYPE_GRAPH;
1525+
ChildNodeParams.graph.graph = ChildGraph;
1526+
ChildNodeParams.graph.ownership = CU_GRAPH_CHILD_GRAPH_OWNERSHIP_MOVE;
1527+
UR_CHECK_ERROR(cuGraphAddNode_v2(&GraphNode, hCommandBuffer->CudaGraph,
1528+
DepsList.data(), NULL /* edge data */,
1529+
DepsList.size(), &ChildNodeParams));
1530+
// The handle to the child graph is now owned by the parent and will be
1531+
// destroyed when the parent is destroyed. However, the SYCL-RT will
1532+
// call `urCommandBufferReleaseExp` on the child command-buffer, to
1533+
// avoid destroying the underlying handle, set it to nullptr.
1534+
hChildCommandBuffer->CudaGraph = nullptr;
1535+
#else
15181536
UR_CHECK_ERROR(
15191537
cuGraphAddChildGraphNode(&GraphNode, hCommandBuffer->CudaGraph,
15201538
DepsList.data(), DepsList.size(), ChildGraph));
1539+
#endif
1540+
15211541
auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode);
15221542
if (pSyncPoint) {
15231543
*pSyncPoint = SyncPoint;

0 commit comments

Comments
 (0)