Skip to content

Commit a05ed96

Browse files
authored
[SYCL][Graph] Support work_group_memory extension (#16229)
Document support for using [sycl_ext_oneapi_work_group_memory](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_work_group_memory.asciidoc) inside a graph kernel node, with E2E tests for verification. E2E Tests for `work_group_memory` update in executable graphs is has been included in PR #16025 , as it requires a UR fix that is part of that PR.
1 parent 590960a commit a05ed96

File tree

8 files changed

+163
-4
lines changed

8 files changed

+163
-4
lines changed

sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1977,14 +1977,20 @@ Removing this restriction is something we may look at for future revisions of
19771977
The command submission functions defined in
19781978
link:../experimental/sycl_ext_oneapi_enqueue_functions.asciidoc[sycl_ext_oneapi_enqueue_functions]
19791979
can be used adding nodes to a graph when creating a graph from queue recording.
1980-
New methods are also defined that enable submitting an executable graph,
1980+
New methods are also defined that enable submitting an executable graph,
19811981
e.g. directly to a queue without returning an event.
19821982

19831983
==== sycl_ext_oneapi_free_function_kernels
19841984

1985-
`sycl_ext_oneapi_free_function_kernels`, defined in
1985+
`sycl_ext_oneapi_free_function_kernels`, defined in
19861986
link:../proposed/sycl_ext_oneapi_free_function_kernels.asciidoc[sycl_ext_oneapi_free_function_kernels]
1987-
can be used with SYCL Graphs.
1987+
can be used with SYCL Graphs.
1988+
1989+
==== sycl_ext_oneapi_work_group_memory
1990+
1991+
Using the `work_group_memory` object defined in
1992+
link:../experimental/sycl_ext_oneapi_work_group_memory.asciidoc[sycl_ext_oneapi_work_group_memory]
1993+
inside graph kernel nodes is supported.
19881994

19891995
==== sycl_ext_oneapi_work_group_scratch_memory
19901996

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
// RUN: %{build} -o %t.out
2+
// RUN: %{run} %t.out
3+
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
4+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
5+
// Extra run to check for immediate-command-list in Level Zero
6+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
7+
8+
#define GRAPH_E2E_EXPLICIT
9+
10+
#include "../Inputs/work_group_memory.cpp"
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
// RUN: %{build} -o %t.out
2+
// RUN: %{run} %t.out
3+
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
4+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
5+
// Extra run to check for immediate-command-list in Level Zero
6+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
7+
8+
// XFAIL: cuda
9+
// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004
10+
11+
#define GRAPH_E2E_EXPLICIT
12+
13+
#include "../Inputs/work_group_memory_free_function.cpp"

sycl/test-e2e/Graph/Inputs/free_function_kernels.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ int main() {
2525
kernel_bundle Bundle = get_kernel_bundle<bundle_state::executable>(Ctxt);
2626
kernel_id Kernel_id = exp_ext::get_kernel_id<ff_0>();
2727
kernel Kernel = Bundle.get_kernel(Kernel_id);
28-
auto KernelNode = Graph.add([&](handler &cgh) {
28+
auto KernelNode = add_node(Graph, Queue, [&](handler &cgh) {
2929
cgh.set_arg(0, PtrA);
3030
cgh.single_task(Kernel);
3131
});
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
// Tests using sycl_ext_oneapi_work_group_memory in a graph node
2+
3+
#include "../graph_common.hpp"
4+
#include <sycl/ext/oneapi/experimental/work_group_memory.hpp>
5+
6+
int main() {
7+
queue Queue;
8+
exp_ext::command_graph Graph{Queue};
9+
10+
std::vector<int> HostData(Size);
11+
std::iota(HostData.begin(), HostData.end(), 10);
12+
13+
int *Ptr = malloc_device<int>(Size, Queue);
14+
Queue.copy(HostData.data(), Ptr, Size).wait();
15+
16+
const size_t LocalSize = 128;
17+
auto node = add_node(Graph, Queue, [&](handler &CGH) {
18+
exp_ext::work_group_memory<int[]> WGMem{LocalSize, CGH};
19+
20+
CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) {
21+
WGMem[Item.get_local_linear_id()] = Item.get_global_linear_id() * 2;
22+
Ptr[Item.get_global_linear_id()] += WGMem[Item.get_local_linear_id()];
23+
});
24+
});
25+
26+
auto GraphExec = Graph.finalize();
27+
28+
for (unsigned N = 0; N < Iterations; N++) {
29+
Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
30+
}
31+
Queue.wait_and_throw();
32+
33+
Queue.copy(Ptr, HostData.data(), Size);
34+
Queue.wait_and_throw();
35+
36+
for (size_t i = 0; i < Size; i++) {
37+
int Ref = 10 + i + (Iterations * (i * 2));
38+
assert(check_value(i, Ref, HostData[i], "Ptr"));
39+
}
40+
41+
free(Ptr, Queue);
42+
return 0;
43+
}
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
// Tests using sycl_ext_oneapi_work_group_memory in a graph node with
2+
// free functions
3+
4+
#include "../graph_common.hpp"
5+
#include <sycl/ext/oneapi/experimental/work_group_memory.hpp>
6+
#include <sycl/ext/oneapi/free_function_queries.hpp>
7+
8+
SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(exp_ext::nd_range_kernel<1>)
9+
void ff_local_mem(int *Ptr, exp_ext::work_group_memory<int[]> LocalMem) {
10+
const auto WI = sycl::ext::oneapi::this_work_item::get_nd_item<1>();
11+
size_t LocalID = WI.get_local_id();
12+
size_t GlobalID = WI.get_global_id();
13+
14+
LocalMem[LocalID] = GlobalID * 2;
15+
Ptr[GlobalID] += LocalMem[LocalID];
16+
}
17+
18+
int main() {
19+
queue Queue;
20+
exp_ext::command_graph Graph{Queue};
21+
22+
std::vector<int> HostData(Size);
23+
std::iota(HostData.begin(), HostData.end(), 10);
24+
25+
int *Ptr = malloc_device<int>(Size, Queue);
26+
Queue.copy(HostData.data(), Ptr, Size).wait();
27+
28+
const size_t LocalSize = 128;
29+
30+
#ifndef __SYCL_DEVICE_ONLY__
31+
kernel_bundle Bundle =
32+
get_kernel_bundle<bundle_state::executable>(Queue.get_context());
33+
kernel_id Kernel_id = exp_ext::get_kernel_id<ff_local_mem>();
34+
kernel Kernel = Bundle.get_kernel(Kernel_id);
35+
36+
auto node = add_node(Graph, Queue, [&](handler &CGH) {
37+
CGH.set_arg(0, Ptr);
38+
39+
exp_ext::work_group_memory<int[]> WGMem{LocalSize, CGH};
40+
CGH.set_arg(1, WGMem);
41+
42+
nd_range NDRange{{Size}, {LocalSize}};
43+
CGH.parallel_for(NDRange, Kernel);
44+
});
45+
46+
auto GraphExec = Graph.finalize();
47+
48+
for (unsigned N = 0; N < Iterations; N++) {
49+
Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
50+
}
51+
Queue.wait_and_throw();
52+
53+
Queue.copy(Ptr, HostData.data(), Size);
54+
Queue.wait_and_throw();
55+
56+
for (size_t i = 0; i < Size; i++) {
57+
int Ref = 10 + i + (Iterations * (i * 2));
58+
assert(check_value(i, Ref, HostData[i], "Ptr"));
59+
}
60+
#endif
61+
62+
free(Ptr, Queue);
63+
return 0;
64+
}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
// RUN: %{build} -o %t.out
2+
// RUN: %{run} %t.out
3+
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
4+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
5+
// Extra run to check for immediate-command-list in Level Zero
6+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
7+
8+
#define GRAPH_E2E_RECORD_REPLAY
9+
10+
#include "../Inputs/work_group_memory.cpp"
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
// RUN: %{build} -o %t.out
2+
// RUN: %{run} %t.out
3+
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
4+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
5+
// Extra run to check for immediate-command-list in Level Zero
6+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
7+
8+
// XFAIL: cuda
9+
// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004
10+
11+
#define GRAPH_E2E_RECORD_REPLAY
12+
13+
#include "../Inputs/work_group_memory_free_function.cpp"

0 commit comments

Comments
 (0)