Skip to content

Commit 2a76b2a

Browse files
[SYCL] Retain PI events until they have signaled (#3350)
It is illegal to destroy an event until it is completed in the GPU. This change prevents sporadic failures due to Level-Zero RT crashes because we released event too early. I am adding E2E test: intel/llvm-test-suite#180 Signed-off-by: Sergey V Maslov <[email protected]>
1 parent d3aeb4a commit 2a76b2a

File tree

2 files changed

+28
-0
lines changed

2 files changed

+28
-0
lines changed

sycl/plugins/level_zero/pi_level_zero.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -462,6 +462,15 @@ createEventAndAssociateQueue(pi_queue Queue, pi_event *Event,
462462
// In piEventRelease, the reference counter of the Queue is decremented
463463
// to release it.
464464
piQueueRetainNoLock(Queue);
465+
466+
// SYCL RT does not track completion of the events, so it could
467+
// release a PI event as soon as that's not being waited in the app.
468+
// But we have to ensure that the event is not destroyed before
469+
// it is really signalled, so retain it explicitly here and
470+
// release in cleanupAfterEvent.
471+
//
472+
PI_CALL(piEventRetain(*Event));
473+
465474
return PI_SUCCESS;
466475
}
467476

@@ -3848,6 +3857,15 @@ static pi_result cleanupAfterEvent(pi_event Event) {
38483857
PI_CALL(piKernelRelease(pi_cast<pi_kernel>(Event->CommandData)));
38493858
Event->CommandData = nullptr;
38503859
}
3860+
3861+
if (!Event->CleanedUp) {
3862+
Event->CleanedUp = true;
3863+
// Release this event since we explicitly retained it on creation.
3864+
// NOTE: that this needs to be done only once for an event so
3865+
// this is guarded with the CleanedUp flag.
3866+
//
3867+
PI_CALL(piEventRelease(Event));
3868+
}
38513869
}
38523870

38533871
// Make a list of all the dependent events that must have signalled
@@ -3871,6 +3889,7 @@ static pi_result cleanupAfterEvent(pi_event Event) {
38713889
EventsToBeReleased);
38723890
PI_CALL(piEventRelease(DepEvent));
38733891
}
3892+
38743893
return PI_SUCCESS;
38753894
}
38763895

@@ -3932,6 +3951,9 @@ pi_result piEventRetain(pi_event Event) {
39323951

39333952
pi_result piEventRelease(pi_event Event) {
39343953
PI_ASSERT(Event, PI_INVALID_EVENT);
3954+
if (!Event->RefCount) {
3955+
die("piEventRelease: called on a destroyed event");
3956+
}
39353957

39363958
if (--(Event->RefCount) == 0) {
39373959
cleanupAfterEvent(Event);

sycl/plugins/level_zero/pi_level_zero.hpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -584,6 +584,12 @@ struct _pi_event : _pi_object {
584584
// enqueued, and must then be released when this event has signalled.
585585
// This list must be destroyed once the event has signalled.
586586
_pi_ze_event_list_t WaitList;
587+
588+
// Tracks if the needed cleanupAfterEvent was already performed for
589+
// a completed event. This allows to control that some cleanup
590+
// actions are performed only once.
591+
//
592+
bool CleanedUp = {false};
587593
};
588594

589595
struct _pi_program : _pi_object {

0 commit comments

Comments
 (0)