@@ -29,5 +29,11 @@
 #include "usm_allocator.hpp"
 
+extern "C" {
+// Forward declarations.
+static pi_result EventRelease(pi_event Event, pi_queue LockedQueue);
+static pi_result QueueRelease(pi_queue Queue, pi_queue LockedQueue);
+}
+
 namespace {
 
 // Controls Level Zero calls serialization to w/a Level Zero driver being not MT
@@ -533,6 +539,7 @@ createEventAndAssociateQueue(pi_queue Queue, pi_event *Event,
   if (CommandList != Queue->CommandListMap.end()) {
     (*Event)->ZeCommandList = CommandList->first;
     CommandList->second.append(*Event);
+    PI_CALL(piEventRetain(*Event));
   } else {
     (*Event)->ZeCommandList = nullptr;
   }
@@ -548,7 +555,7 @@ createEventAndAssociateQueue(pi_queue Queue, pi_event *Event,
   // release a PI event as soon as that's not being waited in the app.
   // But we have to ensure that the event is not destroyed before
   // it is really signalled, so retain it explicitly here and
-  // release in cleanupAfterEvent.
+  // release in Event->cleanup().
   //
   PI_CALL(piEventRetain(*Event));
 
@@ -706,7 +713,17 @@ pi_result _pi_queue::resetCommandList(pi_command_list_ptr_t CommandList,
   ZE_CALL(zeFenceReset, (CommandList->second.ZeFence));
   ZE_CALL(zeCommandListReset, (CommandList->first));
   CommandList->second.InUse = false;
-  CommandList->second.EventList.clear();
+
+  // Finally release/cleanup all the events in this command list.
+  auto &EventList = CommandList->second.EventList;
+  for (auto &Event : EventList) {
+    if (!Event->CleanedUp) {
+      ZE_CALL(zeHostSynchronize, (Event->ZeEvent));
+      Event->cleanup(this);
+    }
+    PI_CALL(EventRelease(Event, this));
+  }
+  EventList.clear();
 
   if (MakeAvailable) {
     std::lock_guard<std::mutex> lock(this->Context->ZeCommandListCacheMutex);
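The retain added in createEventAndAssociateQueue above is balanced here: the command list keeps one reference per appended event and drops it only after the event has been synchronized and cleaned up. Below is a minimal sketch of that retain-on-append / release-on-reset pairing; it uses a plain reference count and illustrative stand-in types, not the real pi_event machinery.

```cpp
// Sketch only: models the ownership pattern, not the actual PI types.
#include <vector>

struct Event {
  int RefCount = 1; // creator's reference
  bool CleanedUp = false;
};

struct CommandList {
  std::vector<Event *> EventList;

  // Mirrors createEventAndAssociateQueue(): appending takes a reference
  // so the event cannot be destroyed while the list still points at it.
  void append(Event *E) {
    ++E->RefCount;
    EventList.push_back(E);
  }

  // Mirrors resetCommandList(): each event is cleaned up (waited on in
  // the real code) and the list's reference is dropped before clear().
  void reset() {
    for (Event *E : EventList) {
      if (!E->CleanedUp)
        E->CleanedUp = true; // stands in for Event->cleanup(this)
      if (--E->RefCount == 0)
        delete E;
    }
    EventList.clear();
  }
};

int main() {
  auto *E = new Event();
  CommandList CL;
  CL.append(E);           // RefCount == 2
  if (--E->RefCount == 0) // creator drops its reference early
    delete E;             // not reached: the list still holds one
  CL.reset();             // last reference dropped here; event freed
}
```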
@@ -2634,13 +2651,20 @@ pi_result piQueueRetain(pi_queue Queue) {
 }
 
 pi_result piQueueRelease(pi_queue Queue) {
+  return QueueRelease(Queue, nullptr);
+}
+
+static pi_result QueueRelease(pi_queue Queue, pi_queue LockedQueue) {
   PI_ASSERT(Queue, PI_INVALID_QUEUE);
   // We need to use a bool variable here to check the condition that
   // RefCount becomes zero atomically with PiQueueMutex lock.
   // Then, we can release the lock before we remove the Queue below.
   bool RefCountZero = false;
   {
-    std::lock_guard<std::mutex> Lock(Queue->PiQueueMutex);
+    auto Lock = ((Queue == LockedQueue)
+                     ? std::unique_lock<std::mutex>()
+                     : std::unique_lock<std::mutex>(Queue->PiQueueMutex));
+
     Queue->RefCount--;
     if (Queue->RefCount == 0)
       RefCountZero = true;
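The conditional lock introduced above (and repeated in Event->cleanup() and EventRelease() below) relies on the fact that a default-constructed std::unique_lock owns no mutex. A minimal standalone sketch of the idiom, with illustrative names only, assuming the caller passes the queue it already has locked as LockedQueue:

```cpp
// Sketch only: demonstrates the dummy-lock idiom with a stand-in Queue type.
#include <mutex>

struct Queue {
  std::mutex Mutex;
  int RefCount = 2;
};

// Locks Queue::Mutex only if the caller does not already hold it
// (signalled by passing the same queue as LockedQueue).
void release(Queue *Q, Queue *LockedQueue) {
  auto Lock = (Q == LockedQueue)
                  ? std::unique_lock<std::mutex>()          // owns nothing
                  : std::unique_lock<std::mutex>(Q->Mutex); // really locks
  --Q->RefCount;
}

int main() {
  Queue Q;
  {
    std::lock_guard<std::mutex> Guard(Q.Mutex); // caller holds the mutex
    release(&Q, &Q);    // an unconditional lock here would deadlock
  }
  release(&Q, nullptr); // nothing held: release() takes the lock itself
}
```

The dummy unique_lock keeps the rest of the function identical in both cases; when it owns no mutex its destructor simply does nothing.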
@@ -4049,7 +4073,7 @@ pi_result piKernelRelease(pi_kernel Kernel) {
   PI_ASSERT(Kernel, PI_INVALID_KERNEL);
 
   if (IndirectAccessTrackingEnabled) {
-    // piKernelRelease is called by cleanupAfterEvent as soon as kernel
+    // piKernelRelease is called by Event->cleanup() as soon as kernel
     // execution has finished. This is the place where we need to release memory
     // allocations. If kernel is not in use (not submitted by some other thread)
     // then release referenced memory allocations. As a result, memory can be
@@ -4199,7 +4223,7 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
 
   // Use piKernelRetain to increment the reference count and indicate
   // that the Kernel is in use. Once the event has been signalled, the
-  // code in cleanupAfterEvent will do a piReleaseKernel to update
+  // code in Event.cleanup() will do a piReleaseKernel to update
   // the reference count on the kernel, using the kernel saved
   // in CommandData.
   PI_CALL(piKernelRetain(Kernel));
@@ -4391,7 +4415,7 @@ pi_result piEventGetProfilingInfo(pi_event Event, pi_profiling_info ParamName,
 // Perform any necessary cleanup after an event has been signalled.
 // This currently recycles the associate command list, and also makes
 // sure to release any kernel that may have been used by the event.
-static pi_result cleanupAfterEvent(pi_event Event) {
+pi_result _pi_event::cleanup(pi_queue LockedQueue) {
   // The implementation of this is slightly tricky. The same event
   // can be referred to by multiple threads, so it is possible to
   // have a race condition between the read of fields of the event,
@@ -4401,21 +4425,18 @@ static pi_result cleanupAfterEvent(pi_event Event) {
   // queue to also serve as the thread safety mechanism for the
   // any of the Event's data members that need to be read/reset as
   // part of the cleanup operations.
-  auto Queue = Event->Queue;
   if (Queue) {
     // Lock automatically releases when this goes out of scope.
-    std::lock_guard<std::mutex> lock(Queue->PiQueueMutex);
+    auto Lock = ((Queue == LockedQueue)
+                     ? std::unique_lock<std::mutex>()
+                     : std::unique_lock<std::mutex>(Queue->PiQueueMutex));
 
-    // Cleanup the command list associated with the event if it hasn't
-    // been cleaned up already.
-    auto EventCommandList = Event->ZeCommandList;
-
-    if (EventCommandList) {
+    if (ZeCommandList) {
       // Event has been signalled: If the fence for the associated command list
       // is signalled, then reset the fence and command list and add them to the
       // available list for reuse in PI calls.
       if (Queue->RefCount > 0) {
-        auto it = Queue->CommandListMap.find(EventCommandList);
+        auto it = Queue->CommandListMap.find(ZeCommandList);
         if (it == Queue->CommandListMap.end()) {
           die("Missing command-list completition fence");
         }
@@ -4436,42 +4457,41 @@ static pi_result cleanupAfterEvent(pi_event Event) {
           // too, so we might need to add a new command type to differentiate.
           //
           ze_result_t ZeResult =
-              (Event->CommandType == PI_COMMAND_TYPE_MEM_BUFFER_COPY)
+              (CommandType == PI_COMMAND_TYPE_MEM_BUFFER_COPY)
                   ? ZE_CALL_NOCHECK(zeHostSynchronize, (it->second.ZeFence))
                   : ZE_CALL_NOCHECK(zeFenceQueryStatus, (it->second.ZeFence));
 
           if (ZeResult == ZE_RESULT_SUCCESS) {
             Queue->resetCommandList(it, true);
-            Event->ZeCommandList = nullptr;
+            ZeCommandList = nullptr;
           }
         }
       }
     }
 
     // Release the kernel associated with this event if there is one.
-    if (Event->CommandType == PI_COMMAND_TYPE_NDRANGE_KERNEL &&
-        Event->CommandData) {
-      PI_CALL(piKernelRelease(pi_cast<pi_kernel>(Event->CommandData)));
-      Event->CommandData = nullptr;
+    if (CommandType == PI_COMMAND_TYPE_NDRANGE_KERNEL && CommandData) {
+      PI_CALL(piKernelRelease(pi_cast<pi_kernel>(CommandData)));
+      CommandData = nullptr;
     }
 
     // If this event was the LastCommandEvent in the queue, being used
     // to make sure that commands were executed in-order, remove this.
     // If we don't do this, the event can get released and freed leaving
     // a dangling pointer to this event. It could also cause unneeded
     // already finished events to show up in the wait list.
-    if (Queue->LastCommandEvent == Event) {
+    if (Queue->LastCommandEvent == this) {
       Queue->LastCommandEvent = nullptr;
     }
   }
 
-  if (!Event->CleanedUp) {
-    Event->CleanedUp = true;
+  if (!CleanedUp) {
+    CleanedUp = true;
     // Release this event since we explicitly retained it on creation.
     // NOTE: that this needs to be done only once for an event so
     // this is guarded with the CleanedUp flag.
     //
-    PI_CALL(piEventRelease(Event));
+    PI_CALL(EventRelease(this, LockedQueue));
   }
 
   // Make a list of all the dependent events that must have signalled
@@ -4484,8 +4504,7 @@ static pi_result cleanupAfterEvent(pi_event Event) {
 
   std::list<pi_event> EventsToBeReleased;
 
-  Event->WaitList.collectEventsForReleaseAndDestroyPiZeEventList(
-      EventsToBeReleased);
+  WaitList.collectEventsForReleaseAndDestroyPiZeEventList(EventsToBeReleased);
 
   while (!EventsToBeReleased.empty()) {
     pi_event DepEvent = EventsToBeReleased.front();
@@ -4499,14 +4518,18 @@ static pi_result cleanupAfterEvent(pi_event Event) {
       // twice, so it is safe. Lock automatically releases when this goes out of
       // scope.
       // TODO: this code needs to be moved out of the guard.
-      std::lock_guard<std::mutex> lock(DepEvent->Queue->PiQueueMutex);
+      auto Lock =
+          ((DepEvent->Queue == LockedQueue)
+               ? std::unique_lock<std::mutex>()
+               : std::unique_lock<std::mutex>(DepEvent->Queue->PiQueueMutex));
+
       if (DepEvent->CommandType == PI_COMMAND_TYPE_NDRANGE_KERNEL &&
           DepEvent->CommandData) {
         PI_CALL(piKernelRelease(pi_cast<pi_kernel>(DepEvent->CommandData)));
         DepEvent->CommandData = nullptr;
       }
     }
-    PI_CALL(piEventRelease(DepEvent));
+    PI_CALL(EventRelease(DepEvent, LockedQueue));
   }
 
   return PI_SUCCESS;
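Taken together, the hunks above thread the new LockedQueue argument through the whole release chain (resetCommandList, EventRelease, _pi_event::cleanup, QueueRelease) so that a release triggered while the queue mutex is already held never tries to re-acquire that same non-recursive mutex. The following is a rough, self-contained model of that call chain, with illustrative stand-in types only, not the plugin's real signatures:

```cpp
// Toy model only: shows why LockedQueue is forwarded down the chain.
#include <mutex>
#include <vector>

struct Event;

struct Queue {
  std::mutex Mutex;
  int RefCount = 1;
  std::vector<Event *> EventList;

  void resetCommandList(); // in the real code, called with Mutex held
};

struct Event {
  Queue *Q = nullptr;
  int RefCount = 1;
  void cleanup(Queue *LockedQueue);
};

// Drops a queue reference; skips locking when the caller already holds
// the queue mutex (Q == LockedQueue).
static void QueueRelease(Queue *Q, Queue *LockedQueue) {
  auto Lock = (Q == LockedQueue) ? std::unique_lock<std::mutex>()
                                 : std::unique_lock<std::mutex>(Q->Mutex);
  --Q->RefCount;
}

// Drops an event reference; the cleanup path may release the owning
// queue as well, so LockedQueue is forwarded all the way down.
static void EventRelease(Event *E, Queue *LockedQueue) {
  if (--E->RefCount == 0) {
    E->cleanup(LockedQueue);
    if (E->Q)
      QueueRelease(E->Q, LockedQueue);
    delete E;
  }
}

void Event::cleanup(Queue *LockedQueue) {
  if (!Q)
    return;
  auto Lock = (Q == LockedQueue) ? std::unique_lock<std::mutex>()
                                 : std::unique_lock<std::mutex>(Q->Mutex);
  // ... recycle the command list, release the associated kernel, etc.
}

void Queue::resetCommandList() {
  // The caller of the real resetCommandList() already holds Mutex, so
  // every nested release receives `this` as LockedQueue and does not
  // try to lock it a second time.
  for (Event *E : EventList)
    EventRelease(E, this);
  EventList.clear();
}

int main() {
  Queue Q;
  auto *E = new Event();
  E->Q = &Q;
  Q.EventList.push_back(E);
  std::lock_guard<std::mutex> Guard(Q.Mutex); // emulate the caller's lock
  Q.resetCommandList(); // no deadlock: the chain sees Q as LockedQueue
}
```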
@@ -4539,7 +4562,7 @@ pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) {
 
     // NOTE: we are cleaning up after the event here to free resources
     // sooner in case run-time is not calling piEventRelease soon enough.
-    cleanupAfterEvent(EventList[I]);
+    EventList[I]->cleanup();
   }
 
   return PI_SUCCESS;
@@ -4571,26 +4594,18 @@ pi_result piEventRetain(pi_event Event) {
 }
 
 pi_result piEventRelease(pi_event Event) {
+  return EventRelease(Event, nullptr);
+}
+
+static pi_result EventRelease(pi_event Event, pi_queue LockedQueue) {
   PI_ASSERT(Event, PI_INVALID_EVENT);
   if (!Event->RefCount) {
     die("piEventRelease: called on a destroyed event");
   }
 
-  // The event is no longer needed upstream, but we have to wait for its
-  // completion in order to do proper cleanup. Otherwise refcount may still be
-  // non-zero in the check below and we will get event leak.
-  //
-  // TODO: in case this potentially "early" wait causes performance problems,
-  // e.g. due to closing a batch too early, or blocking the host for no good
-  // reason, then we should look into moving the wait down to queue release
-  // (will need to remember all events in the queue for that).
-  //
-  if (!Event->CleanedUp)
-    PI_CALL(piEventsWait(1, &Event));
-
   if (--(Event->RefCount) == 0) {
     if (!Event->CleanedUp)
-      cleanupAfterEvent(Event);
+      Event->cleanup(LockedQueue);
 
     if (Event->CommandType == PI_COMMAND_TYPE_MEM_BUFFER_UNMAP &&
         Event->CommandData) {
@@ -4617,7 +4632,7 @@ pi_result piEventRelease(pi_event Event) {
     // pi_event is released. Here we have to decrement it so pi_queue
     // can be released successfully.
     if (Event->Queue) {
-      PI_CALL(piQueueRelease(Event->Queue));
+      PI_CALL(QueueRelease(Event->Queue, LockedQueue));
     }
     delete Event;
   }