Skip to content

Commit b9bf9f5

Browse files
[SYCL][PI][L0] Fix a problem with kernels and programs destruction while they can be used (#2710)
Requirements from summary of level zero spec: 1 – cannot do zeKernelDestroy until all commands using the kernel have completed execution as measured by their event or fence being signaled. 2 – The application must destroy all kernel and build log handles created from the module before destroying the module itself (zeModuleDestroy) So, here is a design I think will work. 1 – We want to do a piKernelRetain sometime before kernel is appended to command-list in piEnqueueKernelLaunch. The intent is that we will do a piKernelRelease once the Event that we create to be associated with the Enqueued Kernel is signaled/released. The handle of kernel to be released will be stored in the event. 2 – Event currently contains fields for CommandType and CommandData, and for the CommandType that piEnqueueKernelLaunch uses, CommandData is currently unused. So, for this case, we will set CommandData to be the Kernel, so then when the event is signaled as done, then we can use the kernel we stored in CommandData, we can do piKernelRelease(CommandData). 3 – I think in piKernelRetain, we also need to call piProgramRetain on the program that the kernel is part of, and we need piKernelRelease to call piProgramRelease. This is necessary to make sure we do not destroy the program before destroying all the kernels that may be in use from the program. 4 – in the destructor for pi_program, we need to destroy the build log before we destroy the program itself.
1 parent 8b55062 commit b9bf9f5

File tree

2 files changed

+90
-56
lines changed

2 files changed

+90
-56
lines changed

sycl/plugins/level_zero/pi_level_zero.cpp

Lines changed: 82 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -426,7 +426,7 @@ pi_result _pi_device::initialize() {
426426
pi_result
427427
_pi_queue::resetCommandListFenceEntry(ze_command_list_handle_t ZeCommandList,
428428
bool MakeAvailable) {
429-
// Event has been signaled: If the fence for the associated command list
429+
// Event has been signalled: If the fence for the associated command list
430430
// is signalled, then reset the fence and command list and add them to the
431431
// available list for ruse in PI calls.
432432
ZE_CALL(zeFenceReset(this->ZeCommandListFenceMap[ZeCommandList]));
@@ -552,28 +552,9 @@ pi_result _pi_device::getAvailableCommandList(
552552

553553
pi_result _pi_queue::executeCommandList(ze_command_list_handle_t ZeCommandList,
554554
ze_fence_handle_t ZeFence,
555-
bool IsBlocking) {
556-
// Close the command list and have it ready for dispatch.
557-
ZE_CALL(zeCommandListClose(ZeCommandList));
558-
// Offload command list to the GPU for asynchronous execution
559-
ZE_CALL(zeCommandQueueExecuteCommandLists(ZeCommandQueue, 1, &ZeCommandList,
560-
ZeFence));
561-
562-
// Check global control to make every command blocking for debugging.
563-
if (IsBlocking || (ZeSerialize & ZeSerializeBlock) != 0) {
564-
// Wait until command lists attached to the command queue are executed.
565-
ZE_CALL(zeCommandQueueSynchronize(ZeCommandQueue, UINT32_MAX));
566-
}
567-
return PI_SUCCESS;
568-
}
569-
570-
bool _pi_queue::isBatchingAllowed() {
571-
return (this->QueueBatchSize > 1 && ((ZeSerialize & ZeSerializeBlock) == 0));
572-
}
573-
574-
pi_result _pi_queue::batchCommandList(ze_command_list_handle_t ZeCommandList,
575-
ze_fence_handle_t ZeFence) {
576-
if (this->isBatchingAllowed()) {
555+
bool IsBlocking,
556+
bool OKToBatchCommand) {
557+
if (OKToBatchCommand && this->isBatchingAllowed()) {
577558
assert(this->ZeOpenCommandList == nullptr ||
578559
this->ZeOpenCommandList == ZeCommandList);
579560

@@ -596,7 +577,22 @@ pi_result _pi_queue::batchCommandList(ze_command_list_handle_t ZeCommandList,
596577
this->ZeOpenCommandListSize = 0;
597578
}
598579

599-
return executeCommandList(ZeCommandList, ZeFence);
580+
// Close the command list and have it ready for dispatch.
581+
ZE_CALL(zeCommandListClose(ZeCommandList));
582+
// Offload command list to the GPU for asynchronous execution
583+
ZE_CALL(zeCommandQueueExecuteCommandLists(ZeCommandQueue, 1, &ZeCommandList,
584+
ZeFence));
585+
586+
// Check global control to make every command blocking for debugging.
587+
if (IsBlocking || (ZeSerialize & ZeSerializeBlock) != 0) {
588+
// Wait until command lists attached to the command queue are executed.
589+
ZE_CALL(zeCommandQueueSynchronize(ZeCommandQueue, UINT32_MAX));
590+
}
591+
return PI_SUCCESS;
592+
}
593+
594+
bool _pi_queue::isBatchingAllowed() {
595+
return (this->QueueBatchSize > 1 && ((ZeSerialize & ZeSerializeBlock) == 0));
600596
}
601597

602598
pi_result _pi_queue::executeOpenCommandList() {
@@ -2759,12 +2755,16 @@ pi_result piextProgramCreateWithNativeHandle(pi_native_handle NativeHandle,
27592755
}
27602756

27612757
_pi_program::~_pi_program() {
2762-
if (ZeModule) {
2763-
ZE_CALL_NOCHECK(zeModuleDestroy(ZeModule));
2764-
}
2758+
// According to Level Zero Specification, all kernels and build logs
2759+
// must be destroyed before the Module can be destroyed. So, be sure
2760+
// to destroy build log before destroying the module.
27652761
if (ZeBuildLog) {
27662762
ZE_CALL_NOCHECK(zeModuleBuildLogDestroy(ZeBuildLog));
27672763
}
2764+
2765+
if (ZeModule) {
2766+
ZE_CALL_NOCHECK(zeModuleDestroy(ZeModule));
2767+
}
27682768
}
27692769

27702770
_pi_program::LinkedReleaser::~LinkedReleaser() {
@@ -2902,6 +2902,10 @@ pi_result piKernelCreate(pi_program Program, const char *KernelName,
29022902
} catch (...) {
29032903
return PI_ERROR_UNKNOWN;
29042904
}
2905+
2906+
// Update the refcount of the program to show its use by this kernel.
2907+
piProgramRetain(Program);
2908+
29052909
return PI_SUCCESS;
29062910
}
29072911

@@ -3091,16 +3095,24 @@ pi_result piKernelRetain(pi_kernel Kernel) {
30913095

30923096
assert(Kernel);
30933097
++(Kernel->RefCount);
3098+
// When retaining a kernel, you are also retaining the program it is part of.
3099+
piProgramRetain(Kernel->Program);
30943100
return PI_SUCCESS;
30953101
}
30963102

30973103
pi_result piKernelRelease(pi_kernel Kernel) {
30983104

30993105
assert(Kernel);
3106+
auto KernelProgram = Kernel->Program;
3107+
31003108
if (--(Kernel->RefCount) == 0) {
31013109
zeKernelDestroy(Kernel->ZeKernel);
31023110
delete Kernel;
31033111
}
3112+
3113+
// do a release on the program this kernel was part of
3114+
piProgramRelease(KernelProgram);
3115+
31043116
return PI_SUCCESS;
31053117
}
31063118

@@ -3112,6 +3124,7 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
31123124
const pi_event *EventWaitList, pi_event *Event) {
31133125
assert(Kernel);
31143126
assert(Queue);
3127+
assert(Event);
31153128
assert((WorkDim > 0) && (WorkDim < 4));
31163129
if (GlobalWorkOffset != NULL) {
31173130
for (pi_uint32 i = 0; i < WorkDim; i++) {
@@ -3194,17 +3207,26 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
31943207
return Res;
31953208

31963209
ze_event_handle_t ZeEvent = nullptr;
3197-
if (Event) {
3198-
auto Res = piEventCreate(Kernel->Program->Context, Event);
3199-
if (Res != PI_SUCCESS)
3200-
return Res;
3210+
auto Res = piEventCreate(Kernel->Program->Context, Event);
3211+
if (Res != PI_SUCCESS)
3212+
return Res;
32013213

3202-
(*Event)->Queue = Queue;
3203-
(*Event)->CommandType = PI_COMMAND_TYPE_NDRANGE_KERNEL;
3204-
(*Event)->ZeCommandList = ZeCommandList;
3214+
(*Event)->Queue = Queue;
3215+
(*Event)->CommandType = PI_COMMAND_TYPE_NDRANGE_KERNEL;
3216+
(*Event)->ZeCommandList = ZeCommandList;
32053217

3206-
ZeEvent = (*Event)->ZeEvent;
3207-
}
3218+
// Save the kernel in the event, so that when the event is signalled
3219+
// the code can do a piKernelRelease on this kernel.
3220+
(*Event)->CommandData = (void *)Kernel;
3221+
3222+
// Use piKernelRetain to increment the reference count and indicate
3223+
// that the Kernel is in use. Once the event has been signalled, the
3224+
// code in cleanupAfterEvent will do a piReleaseKernel to update
3225+
// the reference count on the kernel, using the kernel saved
3226+
// in CommandData.
3227+
piKernelRetain(Kernel);
3228+
3229+
ZeEvent = (*Event)->ZeEvent;
32083230

32093231
ze_event_handle_t *ZeEventWaitList =
32103232
_pi_event::createZeEventList(NumEventsInWaitList, EventWaitList);
@@ -3227,7 +3249,7 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
32273249

32283250
// Execute command list asynchronously, as the event will be used
32293251
// to track down its completion.
3230-
if (auto Res = Queue->batchCommandList(ZeCommandList, ZeFence))
3252+
if (auto Res = Queue->executeCommandList(ZeCommandList, ZeFence, false, true))
32313253
return Res;
32323254

32333255
_pi_event::deleteZeEventList(ZeEventWaitList);
@@ -3356,25 +3378,30 @@ pi_result piEventGetProfilingInfo(pi_event Event, pi_profiling_info ParamName,
33563378
return PI_SUCCESS;
33573379
}
33583380

3359-
// Recycle the command list associated with this event.
3360-
static void recycleEventCommandList(pi_event Event) {
3381+
// Perform any necessary cleanup after an event has been signalled.
3382+
// This currently recycles the associate command list, and also makes
3383+
// sure to release any kernel that may have been used by the event.
3384+
static void cleanupAfterEvent(pi_event Event) {
33613385
// The implementation of this is slightly tricky. The same event
33623386
// can be referred to by multiple threads, so it is possible to
3363-
// have a race condition between the read of ZeCommandList and
3364-
// it being reset to nullptr in another thread.
3365-
// But, since the ZeCommandList is uniquely associated with the queue
3387+
// have a race condition between the read of fields of the event,
3388+
// and reseting those fields in some other thread.
3389+
// But, since the event is uniquely associated with the queue
33663390
// for the event, we use the locking that we already have to do on the
33673391
// queue to also serve as the thread safety mechanism for the
3368-
// Event's ZeCommandList.
3392+
// any of the Event's data members that need to be read/reset as
3393+
// part of the cleanup operations.
33693394
auto Queue = Event->Queue;
33703395

33713396
// Lock automatically releases when this goes out of scope.
33723397
std::lock_guard<std::mutex> lock(Queue->PiQueueMutex);
33733398

3399+
// Cleanup the command list associated with the event if it hasn't
3400+
// been cleaned up already.
33743401
auto EventCommandList = Event->ZeCommandList;
33753402

33763403
if (EventCommandList) {
3377-
// Event has been signaled: If the fence for the associated command list
3404+
// Event has been signalled: If the fence for the associated command list
33783405
// is signalled, then reset the fence and command list and add them to the
33793406
// available list for reuse in PI calls.
33803407
if (Queue->RefCount > 0) {
@@ -3386,6 +3413,13 @@ static void recycleEventCommandList(pi_event Event) {
33863413
}
33873414
}
33883415
}
3416+
3417+
// Release the kernel associated with this event if there is one.
3418+
if (Event->CommandType == PI_COMMAND_TYPE_NDRANGE_KERNEL &&
3419+
Event->CommandData) {
3420+
piKernelRelease(pi_cast<pi_kernel>(Event->CommandData));
3421+
Event->CommandData = nullptr;
3422+
}
33893423
}
33903424

33913425
pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) {
@@ -3412,9 +3446,9 @@ pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) {
34123446
zePrint("ZeEvent = %lx\n", pi_cast<std::uintptr_t>(ZeEvent));
34133447
ZE_CALL(zeEventHostSynchronize(ZeEvent, UINT32_MAX));
34143448

3415-
// NOTE: we are destroying associated command lists here to free
3416-
// resources sooner in case RT is not calling piEventRelease soon enough.
3417-
recycleEventCommandList(EventList[I]);
3449+
// NOTE: we are cleaning up after the event here to free resources
3450+
// sooner in case run-time is not calling piEventRelease soon enough.
3451+
cleanupAfterEvent(EventList[I]);
34183452
}
34193453
return PI_SUCCESS;
34203454
}
@@ -3441,7 +3475,7 @@ pi_result piEventRetain(pi_event Event) {
34413475
pi_result piEventRelease(pi_event Event) {
34423476
assert(Event);
34433477
if (--(Event->RefCount) == 0) {
3444-
recycleEventCommandList(Event);
3478+
cleanupAfterEvent(Event);
34453479

34463480
if (Event->CommandType == PI_COMMAND_TYPE_MEM_BUFFER_UNMAP &&
34473481
Event->CommandData) {

sycl/plugins/level_zero/pi_level_zero.hpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -325,20 +325,20 @@ struct _pi_queue : _pi_object {
325325
pi_result resetCommandListFenceEntry(ze_command_list_handle_t ZeCommandList,
326326
bool MakeAvailable);
327327

328-
// Attach a command list to this queue and allow it to remain open
329-
// and used for further batching. It may be executed immediately,
330-
// or it may be left open for other future command to be batched into.
331-
pi_result batchCommandList(ze_command_list_handle_t ZeCommandList,
332-
ze_fence_handle_t ZeFence);
333-
334328
// Attach a command list to this queue, close, and execute it.
335329
// Note that this command list cannot be appended to after this.
336-
// The "IsBlocking" tells if the wait for completion is requested.
330+
// The "IsBlocking" tells if the wait for completion is required.
337331
// The "ZeFence" passed is used to track when the command list passed
338332
// has completed execution on the device and can be reused.
333+
// If OKToBatchCommand is true, then this command list may be executed
334+
// immediately, or it may be left open for other future command to be
335+
// batched into.
336+
// If IsBlocking is true, then batching will not be allowed regardless
337+
// of the value of OKToBatchCommand
339338
pi_result executeCommandList(ze_command_list_handle_t ZeCommandList,
340339
ze_fence_handle_t ZeFence,
341-
bool IsBlocking = false);
340+
bool IsBlocking = false,
341+
bool OKToBatchCommand = false);
342342

343343
// If there is an open command list associated with this queue,
344344
// close it, exceute it, and reset ZeOpenCommandList, ZeCommandListFence,

0 commit comments

Comments
 (0)