Skip to content

Commit b9d72a9

Browse files
[SYCL][PI][L0] Add batching of multiple commands into a command list before executing that command list. (#2605)
* [SYCL][PI][L0] Add support for batching multiple commands into a command list prior to executing that command list. * Fix for overly aggressive assertion. Needed to executeOpenCommandList in this case. * Added checking for return value from executeOpenCommandList. * Added comment on QueueBatchSize, and removed unneeded check of RefCount in executeOpenCommandList as requested in code review.
1 parent c81828e commit b9d72a9

File tree

3 files changed

+154
-13
lines changed

3 files changed

+154
-13
lines changed

sycl/doc/EnvironmentVariables.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ subject to change. Do not rely on these variables in production code.
2828
| SYCL_DEVICELIB_NO_FALLBACK | Any(\*) | Disable loading and linking of device library images |
2929
| SYCL_PI_LEVEL0_MAX_COMMAND_LIST_CACHE | Positive integer | Maximum number of oneAPI Level Zero Command lists that can be allocated with no reuse before throwing an "out of resources" error. Default is 20000, threshold may be increased based on resource availability and workload demand. |
3030
| SYCL_PI_LEVEL0_DISABLE_USM_ALLOCATOR | Any(\*) | Disable USM allocator in Level Zero plugin (each memory request will go directly to Level Zero runtime) |
31+
| SYCL_PI_LEVEL_ZERO_BATCH_SIZE | Positive integer | Sets a preferred number of commands to batch into a command list before executing the command list. Values 0 and 1 turn off batching. Default is 4. |
3132

3233
`(*) Note: Any means this environment variable is effective when set to any non-null value.`
3334

sycl/plugins/level_zero/pi_level_zero.cpp

Lines changed: 115 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -435,12 +435,46 @@ _pi_queue::resetCommandListFenceEntry(ze_command_list_handle_t ZeCommandList,
435435
return PI_SUCCESS;
436436
}
437437

438+
static const pi_uint32 ZeCommandListBatchSize = [] {
439+
// Default value of 4. This has been seen as a good tradeoff between
440+
// lower overhead of number of enqueue and fence calls, and getting
441+
// commands seen as soon as possible (i.e. lazy vs eager submission).
442+
pi_uint32 BatchSizeVal = 4;
443+
const auto BatchSizeStr = std::getenv("SYCL_PI_LEVEL_ZERO_BATCH_SIZE");
444+
if (BatchSizeStr) {
445+
pi_int32 BatchSizeStrVal = std::atoi(BatchSizeStr);
446+
// Level Zero may only support a limited number of commands per command
447+
// list. The actual upper limit is not specified by the Level Zero
448+
// Specification. For now we allow an arbitrary upper limit.
449+
// Negative numbers will be silently ignored.
450+
if (BatchSizeStrVal >= 0)
451+
BatchSizeVal = BatchSizeStrVal;
452+
}
453+
return BatchSizeVal;
454+
}();
455+
438456
// Retrieve an available command list to be used in a PI call
439457
// Caller must hold a lock on the Queue passed in.
440-
pi_result
441-
_pi_device::getAvailableCommandList(pi_queue Queue,
442-
ze_command_list_handle_t *ZeCommandList,
443-
ze_fence_handle_t *ZeFence) {
458+
pi_result _pi_device::getAvailableCommandList(
459+
pi_queue Queue, ze_command_list_handle_t *ZeCommandList,
460+
ze_fence_handle_t *ZeFence, bool AllowBatching) {
461+
// First see if there is a command-list open for batching commands
462+
// for this queue.
463+
if (Queue->ZeOpenCommandList) {
464+
if (AllowBatching) {
465+
*ZeCommandList = Queue->ZeOpenCommandList;
466+
*ZeFence = Queue->ZeOpenCommandListFence;
467+
return PI_SUCCESS;
468+
}
469+
470+
// If this command isn't allowed to be batched, then we need to
471+
// go ahead and execute what is already in the batched list,
472+
// and then go on to process this. On exit from executeOpenCommandList
473+
// ZeOpenCommandList will be nullptr.
474+
if (auto Res = Queue->executeOpenCommandList())
475+
return Res;
476+
}
477+
444478
// Create/Reuse the command list, because in Level Zero commands are added to
445479
// the command lists, and later are then added to the command queue.
446480
// Each command list is paired with an associated fence to track when the
@@ -525,6 +559,55 @@ pi_result _pi_queue::executeCommandList(ze_command_list_handle_t ZeCommandList,
525559
return PI_SUCCESS;
526560
}
527561

562+
bool _pi_queue::isBatchingAllowed() {
563+
return (this->QueueBatchSize > 1 && ((ZeSerialize & ZeSerializeBlock) == 0));
564+
}
565+
566+
pi_result _pi_queue::batchCommandList(ze_command_list_handle_t ZeCommandList,
567+
ze_fence_handle_t ZeFence) {
568+
if (this->isBatchingAllowed()) {
569+
assert(this->ZeOpenCommandList == nullptr ||
570+
this->ZeOpenCommandList == ZeCommandList);
571+
572+
if (this->ZeOpenCommandListSize + 1 < QueueBatchSize) {
573+
this->ZeOpenCommandList = ZeCommandList;
574+
this->ZeOpenCommandListFence = ZeFence;
575+
576+
// NOTE: we don't know here how many commands are in the ZeCommandList
577+
// but most PI interfaces translate to a single Level-Zero command.
578+
// Some do translate to multiple commands so we may be undercounting
579+
// a bit here, but this is a heuristic, not an exact measure.
580+
//
581+
this->ZeOpenCommandListSize += 1;
582+
583+
return PI_SUCCESS;
584+
}
585+
586+
this->ZeOpenCommandList = nullptr;
587+
this->ZeOpenCommandListFence = nullptr;
588+
this->ZeOpenCommandListSize = 0;
589+
}
590+
591+
return executeCommandList(ZeCommandList, ZeFence);
592+
}
593+
594+
pi_result _pi_queue::executeOpenCommandList() {
595+
// If there are any commands still in the open command list for this
596+
// queue, then close and execute that command list now.
597+
auto OpenList = this->ZeOpenCommandList;
598+
if (OpenList) {
599+
auto OpenListFence = this->ZeOpenCommandListFence;
600+
601+
this->ZeOpenCommandList = nullptr;
602+
this->ZeOpenCommandListFence = nullptr;
603+
this->ZeOpenCommandListSize = 0;
604+
605+
return executeCommandList(OpenList, OpenListFence);
606+
}
607+
608+
return PI_SUCCESS;
609+
}
610+
528611
ze_event_handle_t *_pi_event::createZeEventList(pi_uint32 EventListLength,
529612
const pi_event *EventList) {
530613
try {
@@ -1650,7 +1733,8 @@ pi_result piQueueCreate(pi_context Context, pi_device Device,
16501733

16511734
assert(Queue);
16521735
try {
1653-
*Queue = new _pi_queue(ZeCommandQueue, Context, Device);
1736+
*Queue =
1737+
new _pi_queue(ZeCommandQueue, Context, Device, ZeCommandListBatchSize);
16541738
} catch (const std::bad_alloc &) {
16551739
return PI_OUT_OF_HOST_MEMORY;
16561740
} catch (...) {
@@ -1706,6 +1790,12 @@ pi_result piQueueRelease(pi_queue Queue) {
17061790
std::lock_guard<std::mutex> lock(Queue->PiQueueMutex);
17071791

17081792
if (--(Queue->RefCount) == 0) {
1793+
// It is possible to get to here and still have an open command list
1794+
// if no wait or finish ever occurred for this queue. But still need
1795+
// to make sure commands get executed.
1796+
if (auto Res = Queue->executeOpenCommandList())
1797+
return Res;
1798+
17091799
// Destroy all the fences created associated with this queue.
17101800
for (const auto &MapEntry : Queue->ZeCommandListFenceMap) {
17111801
ZE_CALL(zeFenceDestroy(MapEntry.second));
@@ -1724,6 +1814,10 @@ pi_result piQueueFinish(pi_queue Queue) {
17241814
// Lock automatically releases when this goes out of scope.
17251815
std::lock_guard<std::mutex> lock(Queue->PiQueueMutex);
17261816

1817+
// execute any command list that may still be open.
1818+
if (auto Res = Queue->executeOpenCommandList())
1819+
return Res;
1820+
17271821
ZE_CALL(zeCommandQueueSynchronize(Queue->ZeCommandQueue, UINT32_MAX));
17281822
return PI_SUCCESS;
17291823
}
@@ -1754,7 +1848,7 @@ pi_result piextQueueCreateWithNativeHandle(pi_native_handle NativeHandle,
17541848
// Attach the queue to the "0" device.
17551849
// TODO: see if we need to let user choose the device.
17561850
pi_device Device = Context->Devices[0];
1757-
*Queue = new _pi_queue(ZeQueue, Context, Device);
1851+
*Queue = new _pi_queue(ZeQueue, Context, Device, ZeCommandListBatchSize);
17581852
return PI_SUCCESS;
17591853
}
17601854

@@ -3022,7 +3116,7 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
30223116
ze_command_list_handle_t ZeCommandList = nullptr;
30233117
ze_fence_handle_t ZeFence = nullptr;
30243118
if (auto Res = Queue->Device->getAvailableCommandList(Queue, &ZeCommandList,
3025-
&ZeFence))
3119+
&ZeFence, true))
30263120
return Res;
30273121

30283122
ze_event_handle_t ZeEvent = nullptr;
@@ -3059,7 +3153,7 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
30593153

30603154
// Execute command list asynchronously, as the event will be used
30613155
// to track down its completion.
3062-
if (auto Res = Queue->executeCommandList(ZeCommandList, ZeFence))
3156+
if (auto Res = Queue->batchCommandList(ZeCommandList, ZeFence))
30633157
return Res;
30643158

30653159
_pi_event::deleteZeEventList(ZeEventWaitList);
@@ -3194,6 +3288,19 @@ pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) {
31943288
return PI_INVALID_EVENT;
31953289
}
31963290

3291+
// Submit dependent open command lists for execution, if any
3292+
for (uint32_t I = 0; I < NumEvents; I++) {
3293+
auto Queue = EventList[I]->Queue;
3294+
3295+
// Lock automatically releases when this goes out of scope.
3296+
std::lock_guard<std::mutex> lock(Queue->PiQueueMutex);
3297+
3298+
if (Queue->RefCount > 0) {
3299+
if (auto Res = Queue->executeOpenCommandList())
3300+
return Res;
3301+
}
3302+
}
3303+
31973304
for (uint32_t I = 0; I < NumEvents; I++) {
31983305
ze_event_handle_t ZeEvent = EventList[I]->ZeEvent;
31993306
zePrint("ZeEvent = %lx\n", pi_cast<std::uintptr_t>(ZeEvent));

sycl/plugins/level_zero/pi_level_zero.hpp

Lines changed: 38 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -177,9 +177,13 @@ struct _pi_device : _pi_object {
177177
// caller must pass a command queue to create a new fence for the new command
178178
// list if a command list/fence pair is not available. All Command Lists &
179179
// associated fences are destroyed at Device Release.
180+
// If AllowBatching is true, then the command list returned may already have
181+
// commands in it; if AllowBatching is false, any open command lists that
182+
// already exist in Queue will be closed and executed.
180183
pi_result getAvailableCommandList(pi_queue Queue,
181184
ze_command_list_handle_t *ZeCommandList,
182-
ze_fence_handle_t *ZeFence);
185+
ze_fence_handle_t *ZeFence,
186+
bool AllowBatching = false);
183187

184188
// Cache of the immutable device properties.
185189
ze_device_properties_t ZeDeviceProperties;
@@ -268,8 +272,9 @@ struct _pi_context : _pi_object {
268272

269273
struct _pi_queue : _pi_object {
270274
_pi_queue(ze_command_queue_handle_t Queue, pi_context Context,
271-
pi_device Device)
272-
: ZeCommandQueue{Queue}, Context{Context}, Device{Device} {}
275+
pi_device Device, pi_uint32 QueueBatchSize)
276+
: ZeCommandQueue{Queue}, Context{Context}, Device{Device},
277+
QueueBatchSize{QueueBatchSize} {}
273278

274279
// Level Zero command queue handle.
275280
ze_command_queue_handle_t ZeCommandQueue;
@@ -291,25 +296,53 @@ struct _pi_queue : _pi_object {
291296
// needed/used for the queue data structures.
292297
std::mutex PiQueueMutex;
293298

299+
// Open command list field for batching commands into this queue.
300+
ze_command_list_handle_t ZeOpenCommandList = {nullptr};
301+
ze_fence_handle_t ZeOpenCommandListFence = {nullptr};
302+
pi_uint32 ZeOpenCommandListSize = {0};
303+
304+
// Approximate number of commands that are allowed to be batched for
305+
// this queue.
306+
// Added this member to the queue rather than using a global variable
307+
// so that future implementation could use heuristics to change this on
308+
// a queue specific basis. And by putting it in the queue itself, this
309+
// is thread safe because of the locking of the queue that occurs.
310+
pi_uint32 QueueBatchSize = {0};
311+
294312
// Map of all Command lists created with their associated Fence used for
295313
// tracking when the command list is available for use again.
296314
std::map<ze_command_list_handle_t, ze_fence_handle_t> ZeCommandListFenceMap;
297315

316+
// Returns true if any commands for this queue are allowed to
317+
// be batched together.
318+
bool isBatchingAllowed();
319+
298320
// Resets the Command List and Associated fence in the ZeCommandListFenceMap.
299321
// If the reset command list should be made available, then MakeAvailable
300322
// needs to be set to true. The caller must verify that this command list and
301323
// fence have been signalled.
302324
pi_result resetCommandListFenceEntry(ze_command_list_handle_t ZeCommandList,
303325
bool MakeAvailable);
304326

327+
// Attach a command list to this queue and allow it to remain open
328+
// and used for further batching. It may be executed immediately,
329+
// or it may be left open for other future commands to be batched into.
330+
pi_result batchCommandList(ze_command_list_handle_t ZeCommandList,
331+
ze_fence_handle_t ZeFence);
332+
305333
// Attach a command list to this queue, close, and execute it.
306334
// Note that this command list cannot be appended to after this.
307-
// The "is_blocking" tells if the wait for completion is requested.
335+
// The "IsBlocking" tells if the wait for completion is requested.
308336
// The "ZeFence" passed is used to track when the command list passed
309337
// has completed execution on the device and can be reused.
310338
pi_result executeCommandList(ze_command_list_handle_t ZeCommandList,
311339
ze_fence_handle_t ZeFence,
312-
bool is_blocking = false);
340+
bool IsBlocking = false);
341+
342+
// If there is an open command list associated with this queue,
343+
// close it, execute it, and reset ZeOpenCommandList, ZeOpenCommandListFence,
344+
// and ZeOpenCommandListSize.
345+
pi_result executeOpenCommandList();
313346
};
314347

315348
struct _pi_mem : _pi_object {

0 commit comments

Comments
 (0)