Skip to content

Commit 96a6050

Browse files
BensuoEwanCmfrancepillois
authored
[SYCL][Graph] L0 Backend support for SYCL Graphs (2/4) (#9992)
# Level Zero Backend Support for SYCL Graphs This is the second patch of a series that adds support for an [experimental command graph extension](#5626) A snapshot of the complete work can be seen in draft PR #9375 which has support all the specification defined ways of adding nodes and edges to the graph, including both Explicit and Record & Replay graph construction. The two types of nodes currently implemented are kernel execution and memcpy commands. See https://github.com/reble/llvm#implementation-status for the status of our total work. ## Scope This second patch focuses on the required PI/UR support for the experimental command-buffer feature in the Level Zero adapter: * PI stubs for all adapters to enable compilation, no functionality. * Command-buffer implementation for the Level Zero UR adapter. * Stubs for the CUDA UR adapter to enable compilation, no functionality. ## Following Split PRs Future follow-up PRs with the remainder of our work on the extension will include: * Hooking up backend to graphs runtime, bugfixes and other feature additions, will add symbols but not break the ABI. (3/4) * Add end-to-end tests for SYCL Graph extension. (4/4) * NFC changes - Design doc and codeowner update. ## Authors Co-authored-by: Pablo Reble <[email protected]> Co-authored-by: Julian Miller <[email protected]> Co-authored-by: Ben Tracy <[email protected]> Co-authored-by: Ewan Crawford <[email protected]> Co-authored-by: Maxime France-Pillois <[email protected]> --------- Co-authored-by: Ewan Crawford <[email protected]> Co-authored-by: Maxime France-Pillois <[email protected]>
1 parent 4ca0e06 commit 96a6050

28 files changed

+2618
-2
lines changed

sycl/include/sycl/detail/pi.def

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,4 +158,19 @@ _PI_API(piextEnqueueDeviceGlobalVariableRead)
158158

159159
_PI_API(piPluginGetBackendOption)
160160

161+
// command-buffer Extension
162+
_PI_API(piextCommandBufferCreate)
163+
_PI_API(piextCommandBufferRetain)
164+
_PI_API(piextCommandBufferRelease)
165+
_PI_API(piextCommandBufferFinalize)
166+
_PI_API(piextCommandBufferNDRangeKernel)
167+
_PI_API(piextCommandBufferMemcpyUSM)
168+
_PI_API(piextCommandBufferMemBufferCopy)
169+
_PI_API(piextCommandBufferMemBufferCopyRect)
170+
_PI_API(piextCommandBufferMemBufferWrite)
171+
_PI_API(piextCommandBufferMemBufferWriteRect)
172+
_PI_API(piextCommandBufferMemBufferRead)
173+
_PI_API(piextCommandBufferMemBufferReadRect)
174+
_PI_API(piextEnqueueCommandBuffer)
175+
161176
#undef _PI_API

sycl/include/sycl/detail/pi.h

Lines changed: 225 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,9 +96,10 @@
9696
// piextQueueGetNativeHandle
9797
// 14.33 Added new parameter (memory object properties) to
9898
// piextKernelSetArgMemObj
99+
// 14.34 Added command-buffer extension methods
99100

100101
#define _PI_H_VERSION_MAJOR 14
101-
#define _PI_H_VERSION_MINOR 33
102+
#define _PI_H_VERSION_MINOR 34
102103

103104
#define _PI_STRING_HELPER(a) #a
104105
#define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b)
@@ -477,6 +478,7 @@ typedef enum {
477478
PI_COMMAND_TYPE_SVM_MEMFILL = 0x120B,
478479
PI_COMMAND_TYPE_SVM_MAP = 0x120C,
479480
PI_COMMAND_TYPE_SVM_UNMAP = 0x120D,
481+
PI_COMMAND_TYPE_EXT_COMMAND_BUFFER = 0x12A8,
480482
PI_COMMAND_TYPE_DEVICE_GLOBAL_VARIABLE_READ = 0x418E,
481483
PI_COMMAND_TYPE_DEVICE_GLOBAL_VARIABLE_WRITE = 0x418F
482484
} _pi_command_type;
@@ -2128,6 +2130,228 @@ __SYCL_EXPORT pi_result piGetDeviceAndHostTimer(pi_device Device,
21282130
uint64_t *DeviceTime,
21292131
uint64_t *HostTime);
21302132

2133+
/// Command buffer extension
2134+
struct _pi_ext_command_buffer;
2135+
struct _pi_ext_sync_point;
2136+
using pi_ext_command_buffer = _pi_ext_command_buffer *;
2137+
using pi_ext_sync_point = pi_uint32;
2138+
2139+
typedef enum {
2140+
PI_EXT_STRUCTURE_TYPE_COMMAND_BUFFER_DESC = 0
2141+
} pi_ext_structure_type;
2142+
2143+
struct pi_ext_command_buffer_desc final {
2144+
pi_ext_structure_type stype;
2145+
const void *pNext;
2146+
pi_queue_properties *properties;
2147+
};
2148+
2149+
/// API to create a command-buffer.
2150+
/// \param context The context to associate the command-buffer with.
2151+
/// \param device The device to associate the command-buffer with.
2152+
/// \param desc Descriptor for the new command-buffer.
2153+
/// \param ret_command_buffer Pointer to fill with the address of the new
2154+
/// command-buffer.
2155+
__SYCL_EXPORT pi_result
2156+
piextCommandBufferCreate(pi_context context, pi_device device,
2157+
const pi_ext_command_buffer_desc *desc,
2158+
pi_ext_command_buffer *ret_command_buffer);
2159+
2160+
/// API to increment the reference count of the command-buffer
2161+
/// \param command_buffer The command_buffer to retain.
2162+
__SYCL_EXPORT pi_result
2163+
piextCommandBufferRetain(pi_ext_command_buffer command_buffer);
2164+
2165+
/// API to decrement the reference count of the command-buffer. After the
2166+
/// command_buffer reference count becomes zero and has finished execution, the
2167+
/// command-buffer is deleted.
2168+
/// \param command_buffer The command_buffer to release.
2169+
__SYCL_EXPORT pi_result
2170+
piextCommandBufferRelease(pi_ext_command_buffer command_buffer);
2171+
2172+
/// API to stop command-buffer recording such that no more commands can be
2173+
/// appended, and makes the command-buffer ready to enqueue on a command-queue.
2174+
/// \param command_buffer The command_buffer to finalize.
2175+
__SYCL_EXPORT pi_result
2176+
piextCommandBufferFinalize(pi_ext_command_buffer command_buffer);
2177+
2178+
/// API to append a kernel execution command to the command-buffer.
2179+
/// \param command_buffer The command-buffer to append onto.
2180+
/// \param kernel The kernel to append.
2181+
/// \param work_dim Dimension of the kernel execution.
2182+
/// \param global_work_offset Offset to use when executing kernel.
2183+
/// \param global_work_size Global work size to use when executing kernel.
2184+
/// \param local_work_size Local work size to use when executing kernel.
2185+
/// \param num_sync_points_in_wait_list The number of sync points in the
2186+
/// provided wait list.
2187+
/// \param sync_point_wait_list A list of sync points that this command must
2188+
/// wait on.
2189+
/// \param sync_point The sync_point associated with this kernel execution.
2190+
__SYCL_EXPORT pi_result piextCommandBufferNDRangeKernel(
2191+
pi_ext_command_buffer command_buffer, pi_kernel kernel, pi_uint32 work_dim,
2192+
const size_t *global_work_offset, const size_t *global_work_size,
2193+
const size_t *local_work_size, pi_uint32 num_sync_points_in_wait_list,
2194+
const pi_ext_sync_point *sync_point_wait_list,
2195+
pi_ext_sync_point *sync_point);
2196+
2197+
/// API to append a USM memcpy command to the command-buffer.
2198+
/// \param command_buffer The command-buffer to append onto.
2199+
/// \param dst_ptr is the location the data will be copied
2200+
/// \param src_ptr is the data to be copied
2201+
/// \param size is number of bytes to copy
2202+
/// \param num_sync_points_in_wait_list The number of sync points in the
2203+
/// provided wait list.
2204+
/// \param sync_point_wait_list A list of sync points that this command must
2205+
/// wait on.
2206+
/// \param sync_point The sync_point associated with this memory operation.
2207+
__SYCL_EXPORT pi_result piextCommandBufferMemcpyUSM(
2208+
pi_ext_command_buffer command_buffer, void *dst_ptr, const void *src_ptr,
2209+
size_t size, pi_uint32 num_sync_points_in_wait_list,
2210+
const pi_ext_sync_point *sync_point_wait_list,
2211+
pi_ext_sync_point *sync_point);
2212+
2213+
/// API to append a mem buffer copy command to the command-buffer.
2214+
/// \param command_buffer The command-buffer to append onto.
2215+
/// \param src_buffer is the data to be copied
2216+
/// \param dst_buffer is the location the data will be copied
2217+
/// \param src_offset offset into \p src_buffer
2218+
/// \param dst_offset offset into \p dst_buffer
2219+
/// \param size is number of bytes to copy
2220+
/// \param num_sync_points_in_wait_list The number of sync points in the
2221+
/// provided wait list.
2222+
/// \param sync_point_wait_list A list of sync points that this command must
2223+
/// wait on.
2224+
/// \param sync_point The sync_point associated with this memory operation.
2225+
__SYCL_EXPORT pi_result piextCommandBufferMemBufferCopy(
2226+
pi_ext_command_buffer command_buffer, pi_mem src_buffer, pi_mem dst_buffer,
2227+
size_t src_offset, size_t dst_offset, size_t size,
2228+
pi_uint32 num_sync_points_in_wait_list,
2229+
const pi_ext_sync_point *sync_point_wait_list,
2230+
pi_ext_sync_point *sync_point);
2231+
2232+
/// API to append a rectangular mem buffer copy command to the command-buffer.
2233+
/// \param command_buffer The command-buffer to append onto.
2234+
/// \param src_buffer is the data to be copied
2235+
/// \param dst_buffer is the location the data will be copied
2236+
/// \param src_origin offset for the start of the region to copy in src_buffer
2237+
/// \param dst_origin offset for the start of the region to copy in dst_buffer
2238+
/// \param region The size of the region to be copied
2239+
/// \param src_row_pitch Row pitch for the src data
2240+
/// \param src_slice_pitch Slice pitch for the src data
2241+
/// \param dst_row_pitch Row pitch for the dst data
2242+
/// \param dst_slice_pitch Slice pitch for the dst data
2243+
/// \param num_sync_points_in_wait_list The number of sync points in the
2244+
/// provided wait list.
2245+
/// \param sync_point_wait_list A list of sync points that this command must
2246+
/// wait on.
2247+
/// \param sync_point The sync_point associated with this memory operation.
2248+
__SYCL_EXPORT pi_result piextCommandBufferMemBufferCopyRect(
2249+
pi_ext_command_buffer command_buffer, pi_mem src_buffer, pi_mem dst_buffer,
2250+
pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin,
2251+
pi_buff_rect_region region, size_t src_row_pitch, size_t src_slice_pitch,
2252+
size_t dst_row_pitch, size_t dst_slice_pitch,
2253+
pi_uint32 num_sync_points_in_wait_list,
2254+
const pi_ext_sync_point *sync_point_wait_list,
2255+
pi_ext_sync_point *sync_point);
2256+
2257+
/// API to append a mem buffer read command to the command-buffer.
2258+
/// \param command_buffer The command-buffer to append onto.
2259+
/// \param buffer is the data to be read
2260+
/// \param offset offset into \p buffer
2261+
/// \param size is number of bytes to read
2262+
/// \param dst is the pointer to the destination
2263+
/// \param num_sync_points_in_wait_list The number of sync points in the
2264+
/// provided wait list.
2265+
/// \param sync_point_wait_list A list of sync points that this command must
2266+
/// wait on.
2267+
/// \param sync_point The sync_point associated with this memory operation.
2268+
__SYCL_EXPORT pi_result piextCommandBufferMemBufferRead(
2269+
pi_ext_command_buffer command_buffer, pi_mem buffer, size_t offset,
2270+
size_t size, void *dst, pi_uint32 num_events_in_wait_list,
2271+
const pi_ext_sync_point *sync_point_wait_list,
2272+
pi_ext_sync_point *sync_point);
2273+
2274+
/// API to append a rectangular mem buffer read command to the command-buffer.
2275+
/// \param command_buffer The command-buffer to append onto.
2276+
/// \param buffer is the data to be read
2277+
/// \param buffer_offset offset for the start of the region to read in buffer
2278+
/// \param host_offset offset for the start of the region to be written from ptr
2279+
/// \param region The size of the region to read
2280+
/// \param buffer_row_pitch Row pitch for the source buffer data
2281+
/// \param buffer_slice_pitch Slice pitch for the source buffer data
2282+
/// \param host_row_pitch Row pitch for the destination data ptr
2283+
/// \param host_slice_pitch Slice pitch for the destination data ptr
2284+
/// \param ptr is the location the data will be written
2285+
/// \param num_sync_points_in_wait_list The number of sync points in the
2286+
/// provided wait list.
2287+
/// \param sync_point_wait_list A list of sync points that this command must
2288+
/// wait on.
2289+
/// \param sync_point The sync_point associated with this memory operation.
2290+
__SYCL_EXPORT pi_result piextCommandBufferMemBufferReadRect(
2291+
pi_ext_command_buffer command_buffer, pi_mem buffer,
2292+
pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset,
2293+
pi_buff_rect_region region, size_t buffer_row_pitch,
2294+
size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch,
2295+
void *ptr, pi_uint32 num_events_in_wait_list,
2296+
const pi_ext_sync_point *sync_point_wait_list,
2297+
pi_ext_sync_point *sync_point);
2298+
2299+
/// API to append a mem buffer write command to the command-buffer.
2300+
/// \param command_buffer The command-buffer to append onto.
2301+
/// \param buffer is the location to write the data
2302+
/// \param offset offset into \p buffer
2303+
/// \param size is number of bytes to write
2304+
/// \param ptr is the pointer to the source
2305+
/// \param num_sync_points_in_wait_list The number of sync points in the
2306+
/// provided wait list.
2307+
/// \param sync_point_wait_list A list of sync points that this command must
2308+
/// wait on.
2309+
/// \param sync_point The sync_point associated with this memory operation.
2310+
__SYCL_EXPORT pi_result piextCommandBufferMemBufferWrite(
2311+
pi_ext_command_buffer command_buffer, pi_mem buffer, size_t offset,
2312+
size_t size, const void *ptr, pi_uint32 num_events_in_wait_list,
2313+
const pi_ext_sync_point *sync_point_wait_list,
2314+
pi_ext_sync_point *sync_point);
2315+
2316+
/// API to append a rectangular mem buffer write command to the command-buffer.
2317+
/// \param command_buffer The command-buffer to append onto.
2318+
/// \param buffer is the location to write the data
2319+
/// \param buffer_offset offset for the start of the region to write in buffer
2320+
/// \param host_offset offset for the start of the region to be read from ptr
2321+
/// \param region The size of the region to write
2322+
/// \param buffer_row_pitch Row pitch for the buffer data
2323+
/// \param buffer_slice_pitch Slice pitch for the buffer data
2324+
/// \param host_row_pitch Row pitch for the source data ptr
2325+
/// \param host_slice_pitch Slice pitch for the source data ptr
2326+
/// \param ptr is the pointer to the source
2327+
/// \param num_sync_points_in_wait_list The number of sync points in the
2328+
/// provided wait list.
2329+
/// \param sync_point_wait_list A list of sync points that this command must
2330+
/// wait on.
2331+
/// \param sync_point The sync_point associated with this memory operation.
2332+
__SYCL_EXPORT pi_result piextCommandBufferMemBufferWriteRect(
2333+
pi_ext_command_buffer command_buffer, pi_mem buffer,
2334+
pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset,
2335+
pi_buff_rect_region region, size_t buffer_row_pitch,
2336+
size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch,
2337+
const void *ptr, pi_uint32 num_events_in_wait_list,
2338+
const pi_ext_sync_point *sync_point_wait_list,
2339+
pi_ext_sync_point *sync_point);
2340+
2341+
/// API to submit the command-buffer to queue for execution, returns an error if
2342+
/// the command-buffer is not finalized or another instance of the same
2343+
/// command-buffer is currently executing.
2344+
/// \param command_buffer The command-buffer to be submitted.
2345+
/// \param queue The PI queue to submit on.
2346+
/// \param num_events_in_wait_list The number of events that this execution
2347+
/// depends on.
2348+
/// \param event_wait_list List of pi_events to wait on.
2349+
/// \param event The pi_event associated with this enqueue.
2350+
__SYCL_EXPORT pi_result
2351+
piextEnqueueCommandBuffer(pi_ext_command_buffer command_buffer, pi_queue queue,
2352+
pi_uint32 num_events_in_wait_list,
2353+
const pi_event *event_wait_list, pi_event *event);
2354+
21312355
struct _pi_plugin {
21322356
// PI version supported by host passed to the plugin. The Plugin
21332357
// checks and writes the appropriate Function Pointers in

sycl/include/sycl/detail/pi.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,9 @@ using PiMemObjectType = ::pi_mem_type;
148148
using PiMemImageChannelOrder = ::pi_image_channel_order;
149149
using PiMemImageChannelType = ::pi_image_channel_type;
150150
using PiKernelCacheConfig = ::pi_kernel_cache_config;
151+
using PiExtSyncPoint = ::pi_ext_sync_point;
152+
using PiExtCommandBuffer = ::pi_ext_command_buffer;
153+
using PiExtCommandBufferDesc = ::pi_ext_command_buffer_desc;
151154

152155
__SYCL_EXPORT void contextSetExtendedDeleter(const sycl::context &constext,
153156
pi_context_extended_deleter func,

sycl/plugins/cuda/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,8 @@ add_sycl_plugin(cuda
7979
"../unified_runtime/ur/adapters/cuda/tracing.cpp"
8080
"../unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp"
8181
"../unified_runtime/ur/adapters/cuda/usm.cpp"
82+
"../unified_runtime/ur/adapters/cuda/command_buffer.hpp"
83+
"../unified_runtime/ur/adapters/cuda/command_buffer.cpp"
8284
# ---
8385
"${sycl_inc_dir}/sycl/detail/pi.h"
8486
"${sycl_inc_dir}/sycl/detail/pi.hpp"

sycl/plugins/cuda/pi_cuda.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,19 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
186186
_PI_CL(piGetDeviceAndHostTimer, pi2ur::piGetDeviceAndHostTimer)
187187
_PI_CL(piPluginGetBackendOption, pi2ur::piPluginGetBackendOption)
188188

189+
// command-buffer
190+
_PI_CL(piextCommandBufferCreate, pi2ur::piextCommandBufferCreate)
191+
_PI_CL(piextCommandBufferRetain, pi2ur::piextCommandBufferRetain)
192+
_PI_CL(piextCommandBufferRelease, pi2ur::piextCommandBufferRelease)
193+
_PI_CL(piextCommandBufferNDRangeKernel,
194+
pi2ur::piextCommandBufferNDRangeKernel)
195+
_PI_CL(piextCommandBufferMemcpyUSM, pi2ur::piextCommandBufferMemcpyUSM)
196+
_PI_CL(piextCommandBufferMemBufferCopy,
197+
pi2ur::piextCommandBufferMemBufferCopy)
198+
_PI_CL(piextCommandBufferMemBufferCopyRect,
199+
pi2ur::piextCommandBufferMemBufferCopyRect)
200+
_PI_CL(piextEnqueueCommandBuffer, pi2ur::piextEnqueueCommandBuffer)
201+
189202
#undef _PI_CL
190203

191204
return PI_SUCCESS;

sycl/plugins/cuda/pi_cuda.hpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#define _PI_CUDA_PLUGIN_VERSION_STRING \
2626
_PI_PLUGIN_VERSION_STRING(_PI_CUDA_PLUGIN_VERSION)
2727

28+
#include <ur/adapters/cuda/command_buffer.hpp>
2829
#include <ur/adapters/cuda/context.hpp>
2930
#include <ur/adapters/cuda/device.hpp>
3031
#include <ur/adapters/cuda/event.hpp>
@@ -76,4 +77,8 @@ struct _pi_sampler : ur_sampler_handle_t_ {
7677
using ur_sampler_handle_t_::ur_sampler_handle_t_;
7778
};
7879

80+
struct _pi_ext_command_buffer : ur_exp_command_buffer_handle_t_ {
81+
using ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_;
82+
};
83+
7984
#endif // PI_CUDA_HPP

0 commit comments

Comments
 (0)