@@ -426,7 +426,7 @@ pi_result _pi_device::initialize() {
426
426
pi_result
427
427
_pi_queue::resetCommandListFenceEntry (ze_command_list_handle_t ZeCommandList,
428
428
bool MakeAvailable) {
429
- // Event has been signaled : If the fence for the associated command list
429
+ // Event has been signalled : If the fence for the associated command list
430
430
// is signalled, then reset the fence and command list and add them to the
431
431
// available list for reuse in PI calls.
432
432
ZE_CALL (zeFenceReset (this ->ZeCommandListFenceMap [ZeCommandList]));
@@ -552,28 +552,9 @@ pi_result _pi_device::getAvailableCommandList(
552
552
553
553
pi_result _pi_queue::executeCommandList (ze_command_list_handle_t ZeCommandList,
554
554
ze_fence_handle_t ZeFence,
555
- bool IsBlocking) {
556
- // Close the command list and have it ready for dispatch.
557
- ZE_CALL (zeCommandListClose (ZeCommandList));
558
- // Offload command list to the GPU for asynchronous execution
559
- ZE_CALL (zeCommandQueueExecuteCommandLists (ZeCommandQueue, 1 , &ZeCommandList,
560
- ZeFence));
561
-
562
- // Check global control to make every command blocking for debugging.
563
- if (IsBlocking || (ZeSerialize & ZeSerializeBlock) != 0 ) {
564
- // Wait until command lists attached to the command queue are executed.
565
- ZE_CALL (zeCommandQueueSynchronize (ZeCommandQueue, UINT32_MAX));
566
- }
567
- return PI_SUCCESS;
568
- }
569
-
570
- bool _pi_queue::isBatchingAllowed () {
571
- return (this ->QueueBatchSize > 1 && ((ZeSerialize & ZeSerializeBlock) == 0 ));
572
- }
573
-
574
- pi_result _pi_queue::batchCommandList (ze_command_list_handle_t ZeCommandList,
575
- ze_fence_handle_t ZeFence) {
576
- if (this ->isBatchingAllowed ()) {
555
+ bool IsBlocking,
556
+ bool OKToBatchCommand) {
557
+ if (OKToBatchCommand && this ->isBatchingAllowed ()) {
577
558
assert (this ->ZeOpenCommandList == nullptr ||
578
559
this ->ZeOpenCommandList == ZeCommandList);
579
560
@@ -596,7 +577,22 @@ pi_result _pi_queue::batchCommandList(ze_command_list_handle_t ZeCommandList,
596
577
this ->ZeOpenCommandListSize = 0 ;
597
578
}
598
579
599
- return executeCommandList (ZeCommandList, ZeFence);
580
+ // Close the command list and have it ready for dispatch.
581
+ ZE_CALL (zeCommandListClose (ZeCommandList));
582
+ // Offload command list to the GPU for asynchronous execution
583
+ ZE_CALL (zeCommandQueueExecuteCommandLists (ZeCommandQueue, 1 , &ZeCommandList,
584
+ ZeFence));
585
+
586
+ // Check global control to make every command blocking for debugging.
587
+ if (IsBlocking || (ZeSerialize & ZeSerializeBlock) != 0 ) {
588
+ // Wait until command lists attached to the command queue are executed.
589
+ ZE_CALL (zeCommandQueueSynchronize (ZeCommandQueue, UINT32_MAX));
590
+ }
591
+ return PI_SUCCESS;
592
+ }
593
+
594
+ bool _pi_queue::isBatchingAllowed () {
595
+ return (this ->QueueBatchSize > 1 && ((ZeSerialize & ZeSerializeBlock) == 0 ));
600
596
}
601
597
602
598
pi_result _pi_queue::executeOpenCommandList () {
@@ -2759,12 +2755,16 @@ pi_result piextProgramCreateWithNativeHandle(pi_native_handle NativeHandle,
2759
2755
}
2760
2756
2761
2757
_pi_program::~_pi_program () {
2762
- if (ZeModule) {
2763
- ZE_CALL_NOCHECK ( zeModuleDestroy (ZeModule));
2764
- }
2758
+ // According to Level Zero Specification, all kernels and build logs
2759
+ // must be destroyed before the Module can be destroyed. So, be sure
2760
+ // to destroy build log before destroying the module.
2765
2761
if (ZeBuildLog) {
2766
2762
ZE_CALL_NOCHECK (zeModuleBuildLogDestroy (ZeBuildLog));
2767
2763
}
2764
+
2765
+ if (ZeModule) {
2766
+ ZE_CALL_NOCHECK (zeModuleDestroy (ZeModule));
2767
+ }
2768
2768
}
2769
2769
2770
2770
_pi_program::LinkedReleaser::~LinkedReleaser () {
@@ -2902,6 +2902,10 @@ pi_result piKernelCreate(pi_program Program, const char *KernelName,
2902
2902
} catch (...) {
2903
2903
return PI_ERROR_UNKNOWN;
2904
2904
}
2905
+
2906
+ // Update the refcount of the program to show its use by this kernel.
2907
+ piProgramRetain (Program);
2908
+
2905
2909
return PI_SUCCESS;
2906
2910
}
2907
2911
@@ -3091,16 +3095,24 @@ pi_result piKernelRetain(pi_kernel Kernel) {
3091
3095
3092
3096
assert (Kernel);
3093
3097
++(Kernel->RefCount );
3098
+ // When retaining a kernel, you are also retaining the program it is part of.
3099
+ piProgramRetain (Kernel->Program );
3094
3100
return PI_SUCCESS;
3095
3101
}
3096
3102
3097
3103
pi_result piKernelRelease (pi_kernel Kernel) {
3098
3104
3099
3105
assert (Kernel);
3106
+ auto KernelProgram = Kernel->Program ;
3107
+
3100
3108
if (--(Kernel->RefCount ) == 0 ) {
3101
3109
zeKernelDestroy (Kernel->ZeKernel );
3102
3110
delete Kernel;
3103
3111
}
3112
+
3113
+ // do a release on the program this kernel was part of
3114
+ piProgramRelease (KernelProgram);
3115
+
3104
3116
return PI_SUCCESS;
3105
3117
}
3106
3118
@@ -3112,6 +3124,7 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
3112
3124
const pi_event *EventWaitList, pi_event *Event) {
3113
3125
assert (Kernel);
3114
3126
assert (Queue);
3127
+ assert (Event);
3115
3128
assert ((WorkDim > 0 ) && (WorkDim < 4 ));
3116
3129
if (GlobalWorkOffset != NULL ) {
3117
3130
for (pi_uint32 i = 0 ; i < WorkDim; i++) {
@@ -3194,17 +3207,26 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
3194
3207
return Res;
3195
3208
3196
3209
ze_event_handle_t ZeEvent = nullptr ;
3197
- if (Event) {
3198
- auto Res = piEventCreate (Kernel->Program ->Context , Event);
3199
- if (Res != PI_SUCCESS)
3200
- return Res;
3210
+ auto Res = piEventCreate (Kernel->Program ->Context , Event);
3211
+ if (Res != PI_SUCCESS)
3212
+ return Res;
3201
3213
3202
- (*Event)->Queue = Queue;
3203
- (*Event)->CommandType = PI_COMMAND_TYPE_NDRANGE_KERNEL;
3204
- (*Event)->ZeCommandList = ZeCommandList;
3214
+ (*Event)->Queue = Queue;
3215
+ (*Event)->CommandType = PI_COMMAND_TYPE_NDRANGE_KERNEL;
3216
+ (*Event)->ZeCommandList = ZeCommandList;
3205
3217
3206
- ZeEvent = (*Event)->ZeEvent ;
3207
- }
3218
+ // Save the kernel in the event, so that when the event is signalled
3219
+ // the code can do a piKernelRelease on this kernel.
3220
+ (*Event)->CommandData = (void *)Kernel;
3221
+
3222
+ // Use piKernelRetain to increment the reference count and indicate
3223
+ // that the Kernel is in use. Once the event has been signalled, the
3224
+ // code in cleanupAfterEvent will do a piKernelRelease to update
3225
+ // the reference count on the kernel, using the kernel saved
3226
+ // in CommandData.
3227
+ piKernelRetain (Kernel);
3228
+
3229
+ ZeEvent = (*Event)->ZeEvent ;
3208
3230
3209
3231
ze_event_handle_t *ZeEventWaitList =
3210
3232
_pi_event::createZeEventList (NumEventsInWaitList, EventWaitList);
@@ -3227,7 +3249,7 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
3227
3249
3228
3250
// Execute command list asynchronously, as the event will be used
3229
3251
// to track down its completion.
3230
- if (auto Res = Queue->batchCommandList (ZeCommandList, ZeFence))
3252
+ if (auto Res = Queue->executeCommandList (ZeCommandList, ZeFence, false , true ))
3231
3253
return Res;
3232
3254
3233
3255
_pi_event::deleteZeEventList (ZeEventWaitList);
@@ -3356,25 +3378,30 @@ pi_result piEventGetProfilingInfo(pi_event Event, pi_profiling_info ParamName,
3356
3378
return PI_SUCCESS;
3357
3379
}
3358
3380
3359
- // Recycle the command list associated with this event.
3360
- static void recycleEventCommandList (pi_event Event) {
3381
+ // Perform any necessary cleanup after an event has been signalled.
3382
+ // This currently recycles the associated command list, and also makes
3383
+ // sure to release any kernel that may have been used by the event.
3384
+ static void cleanupAfterEvent (pi_event Event) {
3361
3385
// The implementation of this is slightly tricky. The same event
3362
3386
// can be referred to by multiple threads, so it is possible to
3363
- // have a race condition between the read of ZeCommandList and
3364
- // it being reset to nullptr in another thread.
3365
- // But, since the ZeCommandList is uniquely associated with the queue
3387
+ // have a race condition between the read of fields of the event,
3388
+ // and resetting those fields in some other thread.
3389
+ // But, since the event is uniquely associated with the queue
3366
3390
// for the event, we use the locking that we already have to do on the
3367
3391
// queue to also serve as the thread safety mechanism for the
3368
- // Event's ZeCommandList.
3392
+ // Event's data members that need to be read/reset as
3393
+ // part of the cleanup operations.
3369
3394
auto Queue = Event->Queue ;
3370
3395
3371
3396
// Lock automatically releases when this goes out of scope.
3372
3397
std::lock_guard<std::mutex> lock (Queue->PiQueueMutex );
3373
3398
3399
+ // Clean up the command list associated with the event if it hasn't
3400
+ // been cleaned up already.
3374
3401
auto EventCommandList = Event->ZeCommandList ;
3375
3402
3376
3403
if (EventCommandList) {
3377
- // Event has been signaled : If the fence for the associated command list
3404
+ // Event has been signalled : If the fence for the associated command list
3378
3405
// is signalled, then reset the fence and command list and add them to the
3379
3406
// available list for reuse in PI calls.
3380
3407
if (Queue->RefCount > 0 ) {
@@ -3386,6 +3413,13 @@ static void recycleEventCommandList(pi_event Event) {
3386
3413
}
3387
3414
}
3388
3415
}
3416
+
3417
+ // Release the kernel associated with this event if there is one.
3418
+ if (Event->CommandType == PI_COMMAND_TYPE_NDRANGE_KERNEL &&
3419
+ Event->CommandData ) {
3420
+ piKernelRelease (pi_cast<pi_kernel>(Event->CommandData ));
3421
+ Event->CommandData = nullptr ;
3422
+ }
3389
3423
}
3390
3424
3391
3425
pi_result piEventsWait (pi_uint32 NumEvents, const pi_event *EventList) {
@@ -3412,9 +3446,9 @@ pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) {
3412
3446
zePrint (" ZeEvent = %lx\n " , pi_cast<std::uintptr_t >(ZeEvent));
3413
3447
ZE_CALL (zeEventHostSynchronize (ZeEvent, UINT32_MAX));
3414
3448
3415
- // NOTE: we are destroying associated command lists here to free
3416
- // resources sooner in case RT is not calling piEventRelease soon enough.
3417
- recycleEventCommandList (EventList[I]);
3449
+ // NOTE: we are cleaning up after the event here to free resources
3450
+ // sooner in case run-time is not calling piEventRelease soon enough.
3451
+ cleanupAfterEvent (EventList[I]);
3418
3452
}
3419
3453
return PI_SUCCESS;
3420
3454
}
@@ -3441,7 +3475,7 @@ pi_result piEventRetain(pi_event Event) {
3441
3475
pi_result piEventRelease (pi_event Event) {
3442
3476
assert (Event);
3443
3477
if (--(Event->RefCount ) == 0 ) {
3444
- recycleEventCommandList (Event);
3478
+ cleanupAfterEvent (Event);
3445
3479
3446
3480
if (Event->CommandType == PI_COMMAND_TYPE_MEM_BUFFER_UNMAP &&
3447
3481
Event->CommandData ) {
0 commit comments