Skip to content

[SYCL][Graph] Support sycl_ext_oneapi_raw_kernel_arg #15252

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Sep 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -501,7 +501,7 @@ compile time using a template parameter. This underlying type can be an
accessor, a pointer to a USM allocation, scalar passed by value, or a raw byte
representation of the argument. The raw byte representation is intended to
enable updating arguments set using
link:../proposed/sycl_ext_oneapi_raw_kernel_arg.asciidoc[sycl_ext_oneapi_raw_kernel_arg].
link:../experimental/sycl_ext_oneapi_raw_kernel_arg.asciidoc[sycl_ext_oneapi_raw_kernel_arg].

Dynamic parameters are registered with nodes in a modifiable graph, with each
registration associating one or more node arguments to the dynamic parameter
Expand Down
8 changes: 7 additions & 1 deletion sycl/include/sycl/ext/oneapi/experimental/graph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,9 @@ enum class graph_state {
executable, ///< In executable state, the graph is ready to execute.
};

// Forward declare Graph class
// Forward declare ext::oneapi::experimental classes
template <graph_state State> class command_graph;
class raw_kernel_arg;

namespace detail {
// List of sycl features and extensions which are not supported by graphs. Used
Expand Down Expand Up @@ -441,6 +442,11 @@ class __SYCL_EXPORT dynamic_parameter_base {
protected:
void updateValue(const void *NewValue, size_t Size);

// Update a sycl_ext_oneapi_raw_kernel_arg parameter. Size parameter is
// ignored as it represents sizeof(raw_kernel_arg), which doesn't represent
// the number of underlying bytes.
void updateValue(const raw_kernel_arg *NewRawValue, size_t Size);

void updateAccessor(const sycl::detail::AccessorBaseHost *Acc);
std::shared_ptr<dynamic_parameter_impl> impl;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ class raw_kernel_arg {
size_t MArgSize;

friend class sycl::handler;
// For sycl_ext_oneapi_graph integration
friend class detail::dynamic_parameter_impl;
};

} // namespace ext::oneapi::experimental
Expand Down
5 changes: 5 additions & 0 deletions sycl/source/detail/graph_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1714,6 +1714,11 @@ void dynamic_parameter_base::updateValue(const void *NewValue, size_t Size) {
impl->updateValue(NewValue, Size);
}

void dynamic_parameter_base::updateValue(const raw_kernel_arg *NewRawValue,
size_t Size) {
impl->updateValue(NewRawValue, Size);
}

void dynamic_parameter_base::updateAccessor(
const sycl::detail::AccessorBaseHost *Acc) {
impl->updateAccessor(Acc);
Expand Down
31 changes: 31 additions & 0 deletions sycl/source/detail/graph_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <sycl/detail/cg_types.hpp>
#include <sycl/detail/os_util.hpp>
#include <sycl/ext/oneapi/experimental/graph.hpp>
#include <sycl/ext/oneapi/experimental/raw_kernel_arg.hpp>
#include <sycl/handler.hpp>

#include <detail/accessor_impl.hpp>
Expand Down Expand Up @@ -1500,6 +1501,17 @@ class dynamic_parameter_impl {
std::memcpy(MValueStorage.data(), Data, ParamSize);
}

/// sycl_ext_oneapi_raw_kernel_arg constructor
/// Parameter size is taken from member of raw_kernel_arg object.
dynamic_parameter_impl(std::shared_ptr<graph_impl> GraphImpl, size_t,
raw_kernel_arg *Data)
: MGraph(GraphImpl) {
size_t RawArgSize = Data->MArgSize;
const void *RawArgData = Data->MArgData;
MValueStorage.reserve(RawArgSize);
std::memcpy(MValueStorage.data(), RawArgData, RawArgSize);
}

/// Register a node with this dynamic parameter
/// @param NodeImpl The node to be registered
/// @param ArgIndex The arg index for the kernel arg associated with this
Expand All @@ -1511,6 +1523,25 @@ class dynamic_parameter_impl {
/// Get a pointer to the internal value of this dynamic parameter
void *getValue() { return MValueStorage.data(); }

/// Update sycl_ext_oneapi_raw_kernel_arg parameter
/// @param NewRawValue Pointer to a raw_kernel_arg object.
/// @param Size Parameter is ignored.
void updateValue(const raw_kernel_arg *NewRawValue, size_t Size) {
// Number of bytes is taken from member of raw_kernel_arg object rather
// than using the size parameter which represents sizeof(raw_kernel_arg).
std::ignore = Size;
size_t RawArgSize = NewRawValue->MArgSize;
const void *RawArgData = NewRawValue->MArgData;

for (auto &[NodeWeak, ArgIndex] : MNodes) {
auto NodeShared = NodeWeak.lock();
if (NodeShared) {
NodeShared->updateArgValue(ArgIndex, RawArgData, RawArgSize);
}
}
std::memcpy(MValueStorage.data(), RawArgData, RawArgSize);
}

/// Update the internal value of this dynamic parameter as well as the value
/// of this parameter in all registered nodes.
/// @param NewValue Pointer to the new value
Expand Down
12 changes: 12 additions & 0 deletions sycl/test-e2e/Graph/Explicit/raw_kernel_arg.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
// Extra run to check for immediate-command-list in Level Zero
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}

// REQUIRES: ocloc && level_zero

#define GRAPH_E2E_EXPLICIT

#include "../Inputs/raw_kernel_arg.cpp"
49 changes: 49 additions & 0 deletions sycl/test-e2e/Graph/Inputs/raw_kernel_arg.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
// Tests using a raw_kernel_arg with 32-bit sized scalars.

#include "../graph_common.hpp"

auto constexpr CLSource = R"===(
__kernel void RawArgKernel(int scalar, __global int *out) {
size_t id = get_global_id(0);
out[id] = id + scalar;
}
)===";

int main() {
queue Queue{};

auto SourceKB =
sycl::ext::oneapi::experimental::create_kernel_bundle_from_source(
Queue.get_context(),
sycl::ext::oneapi::experimental::source_language::opencl, CLSource);
auto ExecKB = sycl::ext::oneapi::experimental::build(SourceKB);

exp_ext::command_graph Graph{Queue};

int32_t *Ptr = malloc_device<int32_t>(Size, Queue);
Queue.memset(Ptr, 0, Size * sizeof(int32_t)).wait();

int32_t Scalar = 42;
exp_ext::raw_kernel_arg RawScalar(&Scalar, sizeof(int32_t));

auto KernelNode = add_node(Graph, Queue, [&](handler &cgh) {
cgh.set_arg(0, RawScalar);
cgh.set_arg(1, Ptr);
cgh.parallel_for(sycl::range<1>{Size},
ExecKB.ext_oneapi_get_kernel("RawArgKernel"));
});

auto ExecGraph = Graph.finalize();

// Ptr should be filled with values based on Scalar
Queue.ext_oneapi_graph(ExecGraph).wait();

std::vector<int32_t> HostData(Size);
Queue.copy(Ptr, HostData.data(), Size).wait();

for (size_t i = 0; i < Size; i++) {
assert(HostData[i] == (i + Scalar));
}

return 0;
}
78 changes: 78 additions & 0 deletions sycl/test-e2e/Graph/Inputs/whole_update_raw_arg.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
// Tests whole graph update with raw argument extensions

#include "../graph_common.hpp"

void SubmitKernelNode(
exp_ext::command_graph<exp_ext::graph_state::modifiable> Graph, queue Queue,
int32_t *Ptr, exp_ext::raw_kernel_arg &RawArg) {

auto constexpr CLSource = R"===(
__kernel void RawArgKernel(int scalar, __global int *out) {
size_t id = get_global_id(0);
out[id] = id + scalar;
}
)===";

auto SourceKB =
sycl::ext::oneapi::experimental::create_kernel_bundle_from_source(
Queue.get_context(),
sycl::ext::oneapi::experimental::source_language::opencl, CLSource);
auto ExecKB = sycl::ext::oneapi::experimental::build(SourceKB);

add_node(Graph, Queue, [&](handler &cgh) {
cgh.set_arg(0, RawArg);
cgh.set_arg(1, Ptr);
cgh.parallel_for(sycl::range<1>{Size},
ExecKB.ext_oneapi_get_kernel("RawArgKernel"));
});
}

int main() {
queue Queue{};

exp_ext::command_graph GraphA{Queue};

const size_t N = 1024;
int32_t *PtrA = malloc_device<int32_t>(N, Queue);
Queue.memset(PtrA, 0, N * sizeof(int32_t)).wait();

int32_t ScalarA = 42;
sycl::ext::oneapi::experimental::raw_kernel_arg RawScalarA(&ScalarA,
sizeof(int32_t));

SubmitKernelNode(GraphA, Queue, PtrA, RawScalarA);
auto ExecGraphA = GraphA.finalize(exp_ext::property::graph::updatable{});

// PtrA should be filled with values based on ScalarA
Queue.ext_oneapi_graph(ExecGraphA).wait();

std::vector<int32_t> HostDataA(N);
Queue.copy(PtrA, HostDataA.data(), N).wait();
for (size_t i = 0; i < N; i++) {
assert(HostDataA[i] == (i + ScalarA));
}

exp_ext::command_graph GraphB{Queue.get_context(), Queue.get_device()};

int32_t *PtrB = malloc_device<int32_t>(N, Queue);
Queue.memset(PtrB, 0, N * sizeof(int32_t)).wait();

int32_t ScalarB = 0xA;
sycl::ext::oneapi::experimental::raw_kernel_arg RawScalarB(&ScalarB,
sizeof(int32_t));

// Swap ScalarB and PtrB to be the new inputs/outputs
SubmitKernelNode(GraphB, Queue, PtrB, RawScalarB);
ExecGraphA.update(GraphB);
Queue.ext_oneapi_graph(ExecGraphA).wait();

std::vector<int32_t> HostDataB(N);
Queue.copy(PtrA, HostDataA.data(), N);
Queue.copy(PtrB, HostDataB.data(), N);
Queue.wait();
for (size_t i = 0; i < N; i++) {
assert(HostDataA[i] == (i + ScalarA));
assert(HostDataB[i] == (i + ScalarB));
}
return 0;
}
12 changes: 12 additions & 0 deletions sycl/test-e2e/Graph/RecordReplay/raw_kernel_arg.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
// Extra run to check for immediate-command-list in Level Zero
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}

// REQUIRES: ocloc && level_zero

#define GRAPH_E2E_RECORD_REPLAY

#include "../Inputs/raw_kernel_arg.cpp"
12 changes: 12 additions & 0 deletions sycl/test-e2e/Graph/Update/Explicit/whole_update_raw_arg.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
// Extra run to check for immediate-command-list in Level Zero
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}

// REQUIRES: ocloc && level_zero

#define GRAPH_E2E_EXPLICIT

#include "../../Inputs/whole_update_raw_arg.cpp"
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
// Extra run to check for immediate-command-list in Level Zero
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}

// REQUIRES: ocloc && level_zero

#define GRAPH_E2E_RECORD_REPLAY

#include "../../Inputs/whole_update_raw_arg.cpp"
84 changes: 84 additions & 0 deletions sycl/test-e2e/Graph/Update/update_with_raw_kernel_arg.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
// Extra run to check for immediate-command-list in Level Zero
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}

// REQUIRES: ocloc && level_zero

// Tests updating a raw_kernel_arg with 32-bit sized scalars.

#include "../graph_common.hpp"

auto constexpr CLSource = R"===(
__kernel void RawArgKernel(int scalar, __global int *out) {
size_t id = get_global_id(0);
out[id] = id + scalar;
}
)===";

int main() {
queue Queue{};

auto SourceKB =
sycl::ext::oneapi::experimental::create_kernel_bundle_from_source(
Queue.get_context(),
sycl::ext::oneapi::experimental::source_language::opencl, CLSource);
auto ExecKB = sycl::ext::oneapi::experimental::build(SourceKB);

exp_ext::command_graph Graph{Queue};

const size_t N = 1024;
int32_t *PtrA = malloc_device<int32_t>(N, Queue);
int32_t *PtrB = malloc_device<int32_t>(N, Queue);
Queue.memset(PtrA, 0, N * sizeof(int32_t));
Queue.memset(PtrB, 0, N * sizeof(int32_t));
Queue.wait();

int32_t ScalarA = 42;
exp_ext::raw_kernel_arg RawScalarA(&ScalarA, sizeof(int32_t));

int32_t ScalarB = 0xA;
exp_ext::raw_kernel_arg RawScalarB(&ScalarB, sizeof(int32_t));

exp_ext::dynamic_parameter PtrParam(Graph, PtrA);
exp_ext::dynamic_parameter ScalarParam(Graph, RawScalarA);

auto KernelNode = Graph.add([&](handler &cgh) {
cgh.set_arg(0, ScalarParam);
cgh.set_arg(1, PtrParam);
cgh.parallel_for(sycl::range<1>{Size},
ExecKB.ext_oneapi_get_kernel("RawArgKernel"));
});

auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{});

// PtrA should be filled with values based on ScalarA
Queue.ext_oneapi_graph(ExecGraph).wait();

std::vector<int> HostDataA(N);
std::vector<int> HostDataB(N);

Queue.copy(PtrA, HostDataA.data(), N);
Queue.copy(PtrB, HostDataB.data(), N);
Queue.wait();
for (size_t i = 0; i < N; i++) {
assert(HostDataA[i] == (i + ScalarA));
assert(HostDataB[i] == 0);
}

// Swap ScalarB and PtrB to be the new inputs/outputs
PtrParam.update(PtrB);
ScalarParam.update(RawScalarB);
ExecGraph.update(KernelNode);
Queue.ext_oneapi_graph(ExecGraph).wait();

Queue.copy(PtrA, HostDataA.data(), N).wait();
Queue.copy(PtrB, HostDataB.data(), N).wait();
for (size_t i = 0; i < N; i++) {
assert(HostDataA[i] == (i + ScalarA));
assert(HostDataB[i] == (i + ScalarB));
}
return 0;
}
1 change: 1 addition & 0 deletions sycl/test/abi/sycl_symbols_linux.dump
Original file line number Diff line number Diff line change
Expand Up @@ -3062,6 +3062,7 @@ _ZN4sycl3_V13ext6oneapi12experimental6detail14image_mem_implC2ERKNS3_16image_des
_ZN4sycl3_V13ext6oneapi12experimental6detail14image_mem_implD1Ev
_ZN4sycl3_V13ext6oneapi12experimental6detail14image_mem_implD2Ev
_ZN4sycl3_V13ext6oneapi12experimental6detail17build_from_sourceERNS0_13kernel_bundleILNS0_12bundle_stateE3EEERKSt6vectorINS0_6deviceESaISA_EERKS9_INS0_6detail11string_viewESaISG_EEPNSF_6stringESK_
_ZN4sycl3_V13ext6oneapi12experimental6detail22dynamic_parameter_base11updateValueEPKNS3_14raw_kernel_argEm
_ZN4sycl3_V13ext6oneapi12experimental6detail22dynamic_parameter_base11updateValueEPKvm
_ZN4sycl3_V13ext6oneapi12experimental6detail22dynamic_parameter_base14updateAccessorEPKNS0_6detail16AccessorBaseHostE
_ZN4sycl3_V13ext6oneapi12experimental6detail22dynamic_parameter_baseC1ENS3_13command_graphILNS3_11graph_stateE0EEEmPKv
Expand Down
1 change: 1 addition & 0 deletions sycl/test/abi/sycl_symbols_windows.dump
Original file line number Diff line number Diff line change
Expand Up @@ -4265,6 +4265,7 @@
?update@executable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAAXAEBVnode@34567@@Z
?updateAccessor@dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@IEAAXPEBVAccessorBaseHost@267@@Z
?updateValue@dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@IEAAXPEBX_K@Z
?updateValue@dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@IEAAXPEBVraw_kernel_arg@34567@_K@Z
?use_kernel_bundle@handler@_V1@sycl@@QEAAXAEBV?$kernel_bundle@$01@23@@Z
?verifyDeviceHasProgressGuarantee@handler@_V1@sycl@@AEAAXW4forward_progress_guarantee@experimental@oneapi@ext@23@W4execution_scope@56723@1@Z
?verifyUsedKernelBundle@handler@_V1@sycl@@AEAAXAEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@Z
Expand Down
Loading