[SYCL] Use PI APIs for cooperative kernels #12367

Merged: 35 commits, Feb 22, 2024
Commits (35)
de14fcd
[SYCL] Use PI APIs for cooperative kernels
0x12CC Jan 11, 2024
aa90450
Run clang-format
0x12CC Jan 11, 2024
1f80192
Add missing PI symbols
0x12CC Jan 11, 2024
74c795b
Address review comments
0x12CC Jan 15, 2024
10f73d9
Merge branch 'sycl' into pi_cooperative_kernels
0x12CC Jan 15, 2024
b38f6e6
Update PI to include new query parameters
0x12CC Jan 18, 2024
203b825
Merge branch 'sycl' into pi_cooperative_kernels
0x12CC Jan 18, 2024
26fa27c
Merge branch 'sycl' into pi_cooperative_kernels
0x12CC Jan 19, 2024
5d68ba8
Merge branch 'sycl' into pi_cooperative_kernels
0x12CC Jan 19, 2024
3e0b6f3
Update UR commit
0x12CC Jan 19, 2024
522c824
Update UR tag
0x12CC Jan 24, 2024
f2b4dd9
Merge branch 'sycl' into pi_cooperative_kernels
0x12CC Jan 24, 2024
3a28dac
Update UR tag
0x12CC Jan 25, 2024
ac39404
Merge branch 'sycl' into pi_cooperative_kernels
0x12CC Jan 25, 2024
ec0a709
Merge branch 'sycl' into pi_cooperative_kernels
0x12CC Jan 26, 2024
3c2f1cd
Update PI minor version
0x12CC Feb 13, 2024
4959a75
Merge branch 'sycl' into pi_cooperative_kernels
0x12CC Feb 13, 2024
07075d7
Update UR commit
0x12CC Feb 13, 2024
417ad60
Move cooperative check to `isFusable`
0x12CC Feb 14, 2024
fb494d5
Merge branch 'sycl' into pi_cooperative_kernels
0x12CC Feb 14, 2024
95363c5
Add branch to root group barrier
0x12CC Feb 14, 2024
1b0e4ba
Use `static_cast` in `isFusable`
0x12CC Feb 14, 2024
f2dfb69
Update warning message and add test case for non-fusable cooperative …
0x12CC Feb 14, 2024
9d59da1
Disable `use_root_sync` on HIP
0x12CC Feb 15, 2024
559f4d3
Merge branch 'sycl' into pi_cooperative_kernels
0x12CC Feb 15, 2024
b6471e9
Re-enable `use_root_sync` on HIP
0x12CC Feb 15, 2024
a80234b
Move `isFusable` test case for cooperative kernels
0x12CC Feb 15, 2024
5d21e3e
Merge remote-tracking branch 'origin/sycl' into pi_cooperative_kernels
kbenzie Feb 19, 2024
0dbc792
[UR] Bump tag to 3fd11f1d
kbenzie Feb 19, 2024
dabedb4
Merge remote-tracking branch 'origin/sycl' into pi_cooperative_kernels
kbenzie Feb 20, 2024
6136994
Remove `else` after `return`
0x12CC Feb 20, 2024
a34f065
Update sycl/include/sycl/detail/pi.h
0x12CC Feb 21, 2024
fff4a88
Update barrier comment
0x12CC Feb 21, 2024
dfe9281
Merge branch 'sycl' into pi_cooperative_kernels
0x12CC Feb 21, 2024
275d2bd
Merge branch 'sycl' into pi_cooperative_kernels
steffenlarsen Feb 22, 2024
8 changes: 5 additions & 3 deletions sycl/include/sycl/detail/cg.hpp
@@ -125,7 +125,7 @@ class CG {
CG(CG &&CommandGroup) = default;
CG(const CG &CommandGroup) = default;

CGTYPE getType() { return MType; }
CGTYPE getType() const { return MType; }

std::vector<std::vector<char>> &getArgsStorage() {
return MData.MArgsStorage;
@@ -176,6 +176,7 @@ class CGExecKernel : public CG {
std::vector<std::shared_ptr<detail::stream_impl>> MStreams;
std::vector<std::shared_ptr<const void>> MAuxiliaryResources;
sycl::detail::pi::PiKernelCacheConfig MKernelCacheConfig;
bool MKernelIsCooperative = false;

CGExecKernel(NDRDescT NDRDesc, std::shared_ptr<HostKernelBase> HKernel,
std::shared_ptr<detail::kernel_impl> SyclKernel,
@@ -186,14 +187,15 @@ class CGExecKernel : public CG {
std::vector<std::shared_ptr<const void>> AuxiliaryResources,
CGTYPE Type,
sycl::detail::pi::PiKernelCacheConfig KernelCacheConfig,
detail::code_location loc = {})
bool KernelIsCooperative, detail::code_location loc = {})
: CG(Type, std::move(CGData), std::move(loc)),
MNDRDesc(std::move(NDRDesc)), MHostKernel(std::move(HKernel)),
MSyclKernel(std::move(SyclKernel)),
MKernelBundle(std::move(KernelBundle)), MArgs(std::move(Args)),
MKernelName(std::move(KernelName)), MStreams(std::move(Streams)),
MAuxiliaryResources(std::move(AuxiliaryResources)),
MKernelCacheConfig(std::move(KernelCacheConfig)) {
MKernelCacheConfig(std::move(KernelCacheConfig)),
MKernelIsCooperative(KernelIsCooperative) {
assert(getType() == Kernel && "Wrong type of exec kernel CG.");
}

2 changes: 2 additions & 0 deletions sycl/include/sycl/detail/pi.def
@@ -87,6 +87,7 @@ _PI_API(piextKernelSetArgPointer)
_PI_API(piKernelSetExecInfo)
_PI_API(piextKernelCreateWithNativeHandle)
_PI_API(piextKernelGetNativeHandle)
_PI_API(piextKernelSuggestMaxCooperativeGroupCount)
// Event
_PI_API(piEventCreate)
_PI_API(piEventGetInfo)
@@ -105,6 +106,7 @@ _PI_API(piSamplerRetain)
_PI_API(piSamplerRelease)
// Queue commands
_PI_API(piEnqueueKernelLaunch)
_PI_API(piextEnqueueCooperativeKernelLaunch)
_PI_API(piEnqueueEventsWait)
_PI_API(piEnqueueEventsWaitWithBarrier)
_PI_API(piEnqueueMemBufferRead)
22 changes: 21 additions & 1 deletion sycl/include/sycl/detail/pi.h
@@ -152,9 +152,11 @@
// 15.43 Changed the signature of piextMemGetNativeHandle to also take a
// pi_device
// 15.44 Add coarse-grain memory advice flag for HIP.
// 15.45 Added piextKernelSuggestMaxCooperativeGroupCount and
// piextEnqueueCooperativeKernelLaunch.

#define _PI_H_VERSION_MAJOR 15
#define _PI_H_VERSION_MINOR 44
#define _PI_H_VERSION_MINOR 45

#define _PI_STRING_HELPER(a) #a
#define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b)
@@ -1670,6 +1672,18 @@ __SYCL_EXPORT pi_result piextKernelCreateWithNativeHandle(
__SYCL_EXPORT pi_result
piextKernelGetNativeHandle(pi_kernel kernel, pi_native_handle *nativeHandle);

/// Gets the max work group count for a cooperative kernel.
///
/// \param kernel is the PI kernel being queried.
/// \param local_work_size is the number of work items in a work group that
/// will be used when the kernel is launched.
/// \param dynamic_shared_memory_size is the size of dynamic shared memory per
/// work group, in bytes, that will be used when the kernel is launched.
/// \param group_count_ret is a pointer to where the query result will be
/// stored.
__SYCL_EXPORT pi_result piextKernelSuggestMaxCooperativeGroupCount(
pi_kernel kernel, size_t local_work_size, size_t dynamic_shared_memory_size,
pi_uint32 *group_count_ret);

//
// Events
//
@@ -1752,6 +1766,12 @@ __SYCL_EXPORT pi_result piEnqueueKernelLaunch(
const size_t *local_work_size, pi_uint32 num_events_in_wait_list,
const pi_event *event_wait_list, pi_event *event);

__SYCL_EXPORT pi_result piextEnqueueCooperativeKernelLaunch(
pi_queue queue, pi_kernel kernel, pi_uint32 work_dim,
const size_t *global_work_offset, const size_t *global_work_size,
const size_t *local_work_size, pi_uint32 num_events_in_wait_list,
const pi_event *event_wait_list, pi_event *event);

__SYCL_EXPORT pi_result piEnqueueEventsWait(pi_queue command_queue,
pi_uint32 num_events_in_wait_list,
const pi_event *event_wait_list,
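For orientation, the two new entry points are intended to be used together: query the group count the backend can synchronize cooperatively for a given work-group size, then launch at most that many work groups. A minimal sketch follows (not part of the diff; Queue and Kernel are assumed to be valid handles obtained through the usual PI calls, and the sizes are illustrative):

#include <sycl/detail/pi.h>

// Sketch: launch a cooperative kernel within the backend-suggested group count.
pi_result launchCooperative(pi_queue Queue, pi_kernel Kernel) {
  const size_t LocalSize = 256;  // work items per work group
  const size_t DynSharedMem = 0; // no dynamic shared memory in this example

  // Ask the backend how many work groups can take part in a cooperative launch.
  pi_uint32 MaxGroups = 0;
  pi_result Res = piextKernelSuggestMaxCooperativeGroupCount(
      Kernel, LocalSize, DynSharedMem, &MaxGroups);
  if (Res != PI_SUCCESS)
    return Res;

  // Launch no more work groups than the backend reported.
  const size_t GlobalSize = static_cast<size_t>(MaxGroups) * LocalSize;
  const size_t GlobalOffset = 0;
  return piextEnqueueCooperativeKernelLaunch(
      Queue, Kernel, /*work_dim=*/1, &GlobalOffset, &GlobalSize, &LocalSize,
      /*num_events_in_wait_list=*/0, /*event_wait_list=*/nullptr,
      /*event=*/nullptr);
}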
6 changes: 6 additions & 0 deletions sycl/include/sycl/detail/spirv.hpp
@@ -23,6 +23,7 @@ struct sub_group;
namespace experimental {
template <typename ParentGroup> class ballot_group;
template <size_t PartitionSize, typename ParentGroup> class fixed_size_group;
template <int Dimensions> class root_group;
template <typename ParentGroup> class tangle_group;
class opportunistic_group;
} // namespace experimental
@@ -51,6 +52,11 @@ namespace spirv {

template <typename Group> struct group_scope {};

template <int Dimensions>
struct group_scope<sycl::ext::oneapi::experimental::root_group<Dimensions>> {
static constexpr __spv::Scope::Flag value = __spv::Scope::Flag::Device;
};

template <int Dimensions> struct group_scope<group<Dimensions>> {
static constexpr __spv::Scope::Flag value = __spv::Scope::Flag::Workgroup;
};
6 changes: 3 additions & 3 deletions sycl/include/sycl/detail/type_traits.hpp
@@ -45,9 +45,9 @@ template <class T>
inline constexpr bool is_fixed_topology_group_v =
is_fixed_topology_group<T>::value;

#ifdef SYCL_EXT_ONEAPI_ROOT_GROUP
template <> struct is_fixed_topology_group<root_group> : std::true_type {};
#endif
template <int Dimensions> class root_group;
template <int Dimensions>
struct is_fixed_topology_group<root_group<Dimensions>> : std::true_type {};

template <int Dimensions>
struct is_fixed_topology_group<sycl::group<Dimensions>> : std::true_type {};
40 changes: 17 additions & 23 deletions sycl/include/sycl/ext/oneapi/experimental/root_group.hpp
@@ -10,10 +10,10 @@

#include <sycl/builtins.hpp>
#include <sycl/ext/oneapi/properties/properties.hpp>
#include <sycl/group.hpp>
#include <sycl/memory_enums.hpp>
#include <sycl/queue.hpp>

#define SYCL_EXT_ONEAPI_ROOT_GROUP 1
#include <sycl/nd_item.hpp>
#include <sycl/sub_group.hpp>

namespace sycl {
inline namespace _V1 {
@@ -106,31 +106,25 @@ template <int Dimensions> root_group<Dimensions> get_root_group() {

} // namespace ext::oneapi::experimental

template <>
typename ext::oneapi::experimental::info::kernel_queue_specific::
max_num_work_group_sync::return_type
kernel::ext_oneapi_get_info<
ext::oneapi::experimental::info::kernel_queue_specific::
max_num_work_group_sync>(const queue &q) const {
// TODO: query the backend to return a value >= 1.
return 1;
}

template <int dimensions>
void group_barrier(ext::oneapi::experimental::root_group<dimensions> G,
memory_scope FenceScope = decltype(G)::fence_scope) {
(void)G;
(void)FenceScope;
#ifdef __SYCL_DEVICE_ONLY__
// TODO: Change __spv::Scope::Workgroup to __spv::Scope::Device once backends
// support device scope. __spv::Scope::Workgroup is only valid when
// max_num_work_group_sync is 1, so that all work items in a root group will
// also be in the same work group.
__spirv_ControlBarrier(__spv::Scope::Workgroup, __spv::Scope::Workgroup,
__spv::MemorySemanticsMask::SubgroupMemory |
__spv::MemorySemanticsMask::WorkgroupMemory |
__spv::MemorySemanticsMask::CrossWorkgroupMemory);
// Root group barrier synchronizes using a work group barrier if there's only
// one work group. This allows backends to ignore the ControlBarrier with
// Device scope if their maximum number of work groups is 1. This is a
// workaround that's not intended to reduce the bar for SPIR-V modules
// acceptance, but rather make a pessimistic case work until we have full
// support for the device barrier built-in from backends.
const auto ChildGroup = ext::oneapi::experimental::get_child_group(G);
if (ChildGroup.get_group_linear_range() == 1) {
group_barrier(ChildGroup);
} else {
detail::spirv::ControlBarrier(G, FenceScope, memory_order::seq_cst);
}
#else
(void)G;
(void)FenceScope;
throw sycl::runtime_error("Barriers are not supported on host device",
PI_ERROR_INVALID_DEVICE);
#endif
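At the SYCL level, this plumbing backs the sycl_ext_oneapi_root_group extension: the max_num_work_group_sync query is now answered by the backend through piextKernelSuggestMaxCooperativeGroupCount, and kernels submitted with the use_root_sync property go through piextEnqueueCooperativeKernelLaunch. An illustrative end-to-end sketch (not part of the diff; it assumes the extension's kernel_queue_specific::max_num_work_group_sync descriptor, use_root_sync property, and nd_item::ext_oneapi_get_root_group() member as described in the extension document):

#include <sycl/sycl.hpp>

namespace syclex = sycl::ext::oneapi::experimental;

class CoopKernel; // kernel name used for the per-kernel query

int main() {
  sycl::queue q;

  // Obtain the kernel object so the group-count query can be made for it.
  auto kid = sycl::get_kernel_id<CoopKernel>();
  auto bundle = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
      q.get_context(), {kid});
  sycl::kernel krn = bundle.get_kernel(kid);

  // How many work groups can synchronize at root-group scope on this queue?
  const auto maxGroups = krn.ext_oneapi_get_info<
      syclex::info::kernel_queue_specific::max_num_work_group_sync>(q);

  constexpr size_t localSize = 128;
  sycl::nd_range<1> range{maxGroups * localSize, localSize};
  syclex::properties props{syclex::use_root_sync};

  q.submit([&](sycl::handler &cgh) {
     cgh.parallel_for<CoopKernel>(range, props, [=](sycl::nd_item<1> it) {
       auto root = it.ext_oneapi_get_root_group();
       // ... first phase of work ...
       sycl::group_barrier(root); // every work item in the launch synchronizes
       // ... second phase of work ...
     });
   }).wait();
}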
7 changes: 7 additions & 0 deletions sycl/include/sycl/handler.hpp
@@ -32,6 +32,7 @@
#include <sycl/ext/oneapi/device_global/device_global.hpp>
#include <sycl/ext/oneapi/device_global/properties.hpp>
#include <sycl/ext/oneapi/experimental/graph.hpp>
#include <sycl/ext/oneapi/experimental/root_group.hpp>
#include <sycl/ext/oneapi/kernel_properties/properties.hpp>
#include <sycl/ext/oneapi/properties/properties.hpp>
#include <sycl/group.hpp>
@@ -933,6 +934,10 @@ class __SYCL_EXPORT handler {
} else {
std::ignore = Props;
}

constexpr bool UsesRootSync = PropertiesT::template has_property<
sycl::ext::oneapi::experimental::use_root_sync_key>();
setKernelIsCooperative(UsesRootSync);
}

/// Checks whether it is possible to copy the source shape to the destination
@@ -3622,6 +3627,8 @@ class __SYCL_EXPORT handler {

// Set value of the gpu cache configuration for the kernel.
void setKernelCacheConfig(sycl::detail::pi::PiKernelCacheConfig);
// Set value of the kernel is cooperative flag
void setKernelIsCooperative(bool);

template <
ext::oneapi::experimental::detail::UnsupportedGraphFeatures FeatureT>
1 change: 1 addition & 0 deletions sycl/include/sycl/sycl.hpp
@@ -90,6 +90,7 @@
#include <sycl/ext/oneapi/experimental/fixed_size_group.hpp>
#include <sycl/ext/oneapi/experimental/opportunistic_group.hpp>
#include <sycl/ext/oneapi/experimental/prefetch.hpp>
#include <sycl/ext/oneapi/experimental/root_group.hpp>
#include <sycl/ext/oneapi/experimental/tangle_group.hpp>
#include <sycl/ext/oneapi/filter_selector.hpp>
#include <sycl/ext/oneapi/functional.hpp>
17 changes: 17 additions & 0 deletions sycl/plugins/cuda/pi_cuda.cpp
@@ -544,6 +544,16 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
NumEventsInWaitList, EventWaitList, OutEvent);
}

pi_result piextEnqueueCooperativeKernelLaunch(
pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize,
const size_t *LocalWorkSize, pi_uint32 NumEventsInWaitList,
const pi_event *EventWaitList, pi_event *OutEvent) {
return pi2ur::piextEnqueueCooperativeKernelLaunch(
Queue, Kernel, WorkDim, GlobalWorkOffset, GlobalWorkSize, LocalWorkSize,
NumEventsInWaitList, EventWaitList, OutEvent);
}

pi_result piextKernelCreateWithNativeHandle(pi_native_handle NativeHandle,
pi_context Context,
pi_program Program,
@@ -559,6 +569,13 @@ pi_result piextKernelGetNativeHandle(pi_kernel Kernel,
return pi2ur::piextKernelGetNativeHandle(Kernel, NativeHandle);
}

pi_result piextKernelSuggestMaxCooperativeGroupCount(
pi_kernel Kernel, size_t LocalWorkSize, size_t DynamicSharedMemorySize,
pi_uint32 *GroupCountRet) {
return pi2ur::piextKernelSuggestMaxCooperativeGroupCount(
Kernel, LocalWorkSize, DynamicSharedMemorySize, GroupCountRet);
}

pi_result piEventCreate(pi_context Context, pi_event *RetEvent) {
return pi2ur::piEventCreate(Context, RetEvent);
}
17 changes: 17 additions & 0 deletions sycl/plugins/hip/pi_hip.cpp
@@ -547,6 +547,16 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
NumEventsInWaitList, EventWaitList, OutEvent);
}

pi_result piextEnqueueCooperativeKernelLaunch(
pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize,
const size_t *LocalWorkSize, pi_uint32 NumEventsInWaitList,
const pi_event *EventWaitList, pi_event *OutEvent) {
return pi2ur::piextEnqueueCooperativeKernelLaunch(
Queue, Kernel, WorkDim, GlobalWorkOffset, GlobalWorkSize, LocalWorkSize,
NumEventsInWaitList, EventWaitList, OutEvent);
}

pi_result piextKernelCreateWithNativeHandle(pi_native_handle NativeHandle,
pi_context Context,
pi_program Program,
@@ -562,6 +572,13 @@ pi_result piextKernelGetNativeHandle(pi_kernel Kernel,
return pi2ur::piextKernelGetNativeHandle(Kernel, NativeHandle);
}

pi_result piextKernelSuggestMaxCooperativeGroupCount(
pi_kernel Kernel, size_t LocalWorkSize, size_t DynamicSharedMemorySize,
pi_uint32 *GroupCountRet) {
return pi2ur::piextKernelSuggestMaxCooperativeGroupCount(
Kernel, LocalWorkSize, DynamicSharedMemorySize, GroupCountRet);
}

pi_result piEventCreate(pi_context Context, pi_event *RetEvent) {
return pi2ur::piEventCreate(Context, RetEvent);
}
17 changes: 17 additions & 0 deletions sycl/plugins/level_zero/pi_level_zero.cpp
@@ -558,6 +558,16 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
NumEventsInWaitList, EventWaitList, OutEvent);
}

pi_result piextEnqueueCooperativeKernelLaunch(
pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize,
const size_t *LocalWorkSize, pi_uint32 NumEventsInWaitList,
const pi_event *EventWaitList, pi_event *OutEvent) {
return pi2ur::piextEnqueueCooperativeKernelLaunch(
Queue, Kernel, WorkDim, GlobalWorkOffset, GlobalWorkSize, LocalWorkSize,
NumEventsInWaitList, EventWaitList, OutEvent);
}

pi_result piextKernelCreateWithNativeHandle(pi_native_handle NativeHandle,
pi_context Context,
pi_program Program,
@@ -573,6 +583,13 @@ pi_result piextKernelGetNativeHandle(pi_kernel Kernel,
return pi2ur::piextKernelGetNativeHandle(Kernel, NativeHandle);
}

pi_result piextKernelSuggestMaxCooperativeGroupCount(
pi_kernel Kernel, size_t LocalWorkSize, size_t DynamicSharedMemorySize,
pi_uint32 *GroupCountRet) {
return pi2ur::piextKernelSuggestMaxCooperativeGroupCount(
Kernel, LocalWorkSize, DynamicSharedMemorySize, GroupCountRet);
}

//
// Events
//
17 changes: 17 additions & 0 deletions sycl/plugins/opencl/pi_opencl.cpp
@@ -527,6 +527,16 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
NumEventsInWaitList, EventWaitList, OutEvent);
}

pi_result piextEnqueueCooperativeKernelLaunch(
pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize,
const size_t *LocalWorkSize, pi_uint32 NumEventsInWaitList,
const pi_event *EventWaitList, pi_event *OutEvent) {
return pi2ur::piextEnqueueCooperativeKernelLaunch(
Queue, Kernel, WorkDim, GlobalWorkOffset, GlobalWorkSize, LocalWorkSize,
NumEventsInWaitList, EventWaitList, OutEvent);
}

pi_result piextKernelCreateWithNativeHandle(pi_native_handle NativeHandle,
pi_context Context,
pi_program Program,
@@ -541,6 +551,13 @@ pi_result piextKernelGetNativeHandle(pi_kernel Kernel,
return pi2ur::piextKernelGetNativeHandle(Kernel, NativeHandle);
}

pi_result piextKernelSuggestMaxCooperativeGroupCount(
pi_kernel Kernel, size_t LocalWorkSize, size_t DynamicSharedMemorySize,
pi_uint32 *GroupCountRet) {
return pi2ur::piextKernelSuggestMaxCooperativeGroupCount(
Kernel, LocalWorkSize, DynamicSharedMemorySize, GroupCountRet);
}

pi_result piEventCreate(pi_context Context, pi_event *RetEvent) {
return pi2ur::piEventCreate(Context, RetEvent);
}
37 changes: 37 additions & 0 deletions sycl/plugins/unified_runtime/pi2ur.hpp
@@ -2599,6 +2599,19 @@ inline pi_result piextKernelGetNativeHandle(pi_kernel Kernel,
return PI_SUCCESS;
}

inline pi_result piextKernelSuggestMaxCooperativeGroupCount(
pi_kernel Kernel, size_t LocalWorkSize, size_t DynamicSharedMemorySize,
pi_uint32 *GroupCountRet) {
PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
PI_ASSERT(GroupCountRet, PI_ERROR_INVALID_VALUE);

ur_kernel_handle_t UrKernel = reinterpret_cast<ur_kernel_handle_t>(Kernel);
HANDLE_ERRORS(urKernelSuggestMaxCooperativeGroupCountExp(
UrKernel, LocalWorkSize, DynamicSharedMemorySize, GroupCountRet));

return PI_SUCCESS;
}

/// API for writing data from host to a device global variable.
///
/// \param Queue is the queue
@@ -3669,6 +3682,30 @@
return PI_SUCCESS;
}

inline pi_result piextEnqueueCooperativeKernelLaunch(
pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize,
const size_t *LocalWorkSize, pi_uint32 NumEventsInWaitList,
const pi_event *EventsWaitList, pi_event *OutEvent) {

PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
PI_ASSERT((WorkDim > 0) && (WorkDim < 4), PI_ERROR_INVALID_WORK_DIMENSION);

ur_queue_handle_t UrQueue = reinterpret_cast<ur_queue_handle_t>(Queue);
ur_kernel_handle_t UrKernel = reinterpret_cast<ur_kernel_handle_t>(Kernel);
const ur_event_handle_t *UrEventsWaitList =
reinterpret_cast<const ur_event_handle_t *>(EventsWaitList);

ur_event_handle_t *UREvent = reinterpret_cast<ur_event_handle_t *>(OutEvent);

HANDLE_ERRORS(urEnqueueCooperativeKernelLaunchExp(
UrQueue, UrKernel, WorkDim, GlobalWorkOffset, GlobalWorkSize,
LocalWorkSize, NumEventsInWaitList, UrEventsWaitList, UREvent));

return PI_SUCCESS;
}

inline pi_result
piEnqueueMemImageWrite(pi_queue Queue, pi_mem Image, pi_bool BlockingWrite,
pi_image_offset Origin, pi_image_region Region,