Skip to content

Commit 1ef4cd6

Browse files
committed
[SYCL][CUDA] Remove pi Event Callback implementation
Since introduction of host tasks in #1471, `piEventCallback` and related functionality is not required by the SYCL-RT. Removing the implementation of this behaviour from the CUDA backend simplifies the submission of operations to streams and overall increases performance. Signed-off-by: Ruyman Reyes <[email protected]>
1 parent ae3fd5c commit 1ef4cd6

File tree

2 files changed

+36
-219
lines changed

2 files changed

+36
-219
lines changed

sycl/plugins/cuda/pi_cuda.cpp

Lines changed: 29 additions & 125 deletions
Original file line numberDiff line numberDiff line change
@@ -276,13 +276,15 @@ _pi_event::_pi_event(pi_command_type type, pi_context context, pi_queue queue)
276276
isStarted_{false}, evEnd_{nullptr}, evStart_{nullptr}, evQueued_{nullptr},
277277
queue_{queue}, context_{context} {
278278

279-
if (is_native_event()) {
279+
if (type != PI_COMMAND_TYPE_USER) {
280280
PI_CHECK_ERROR(cuEventCreate(&evEnd_, CU_EVENT_DEFAULT));
281281

282282
if (queue_->properties_ & PI_QUEUE_PROFILING_ENABLE) {
283283
PI_CHECK_ERROR(cuEventCreate(&evQueued_, CU_EVENT_DEFAULT));
284284
PI_CHECK_ERROR(cuEventCreate(&evStart_, CU_EVENT_DEFAULT));
285285
}
286+
} else {
287+
cl::sycl::detail::pi::die("User-defined events not implemented");
286288
}
287289

288290
if (queue_ != nullptr) {
@@ -303,7 +305,7 @@ pi_result _pi_event::start() {
303305
pi_result result;
304306

305307
try {
306-
if (is_native_event() && queue_->properties_ & PI_QUEUE_PROFILING_ENABLE) {
308+
if (queue_->properties_ & PI_QUEUE_PROFILING_ENABLE) {
307309
// NOTE: This relies on the default stream to be unused.
308310
result = PI_CHECK_ERROR(cuEventRecord(evQueued_, 0));
309311
result = PI_CHECK_ERROR(cuEventRecord(evStart_, queue_->get()));
@@ -313,8 +315,6 @@ pi_result _pi_event::start() {
313315
}
314316

315317
isStarted_ = true;
316-
// let observers know that the event is "submitted"
317-
trigger_callback(get_execution_status());
318318
return result;
319319
}
320320

@@ -351,37 +351,16 @@ pi_result _pi_event::record() {
351351

352352
pi_result result = PI_INVALID_OPERATION;
353353

354-
if (is_native_event()) {
355-
356-
if (!queue_) {
357-
return PI_INVALID_QUEUE;
358-
}
354+
if (!queue_) {
355+
return PI_INVALID_QUEUE;
356+
}
359357

360-
CUstream cuStream = queue_->get();
358+
CUstream cuStream = queue_->get();
361359

362-
try {
363-
result = PI_CHECK_ERROR(cuEventRecord(evEnd_, cuStream));
364-
365-
result = cuda_piEventRetain(this);
366-
try {
367-
result = PI_CHECK_ERROR(cuLaunchHostFunc(
368-
cuStream,
369-
[](void *userData) {
370-
pi_event event = reinterpret_cast<pi_event>(userData);
371-
event->set_event_complete();
372-
cuda_piEventRelease(event);
373-
},
374-
this));
375-
} catch (...) {
376-
// If host function fails to enqueue we must release the event here
377-
result = cuda_piEventRelease(this);
378-
throw;
379-
}
380-
} catch (pi_result error) {
381-
result = error;
382-
}
383-
} else {
384-
result = PI_SUCCESS;
360+
try {
361+
result = PI_CHECK_ERROR(cuEventRecord(evEnd_, cuStream));
362+
} catch (pi_result error) {
363+
result = error;
385364
}
386365

387366
if (result == PI_SUCCESS) {
@@ -392,65 +371,23 @@ pi_result _pi_event::record() {
392371
}
393372

394373
pi_result _pi_event::wait() {
395-
396374
pi_result retErr;
397-
if (is_native_event()) {
398-
try {
399-
retErr = PI_CHECK_ERROR(cuEventSynchronize(evEnd_));
400-
isCompleted_ = true;
401-
} catch (pi_result error) {
402-
retErr = error;
403-
}
404-
} else {
405-
406-
while (!is_completed()) {
407-
// wait for user event to complete
408-
}
409-
retErr = PI_SUCCESS;
375+
try {
376+
retErr = PI_CHECK_ERROR(cuEventSynchronize(evEnd_));
377+
isCompleted_ = true;
378+
} catch (pi_result error) {
379+
retErr = error;
410380
}
411381

412-
auto is_success = retErr == PI_SUCCESS;
413-
auto status = is_success ? get_execution_status() : pi_int32(retErr);
414-
415-
trigger_callback(status);
416-
417382
return retErr;
418383
}
419384

420385
// makes all future work submitted to queue wait for all work captured in event.
421386
pi_result enqueueEventWait(pi_queue queue, pi_event event) {
422-
if (event->is_native_event()) {
423-
424-
// for native events, the cuStreamWaitEvent call is used.
425-
// This makes all future work submitted to stream wait for all
426-
// work captured in event.
427-
428-
return PI_CHECK_ERROR(cuStreamWaitEvent(queue->get(), event->get(), 0));
429-
430-
} else {
431-
432-
// for user events, we enqueue a callback. When invoked, the
433-
// callback will block until the user event is marked as
434-
// completed.
435-
436-
static auto user_wait_func = [](void *user_data) {
437-
// The host function must not make any CUDA API calls.
438-
auto event = static_cast<pi_event>(user_data);
439-
440-
// busy wait for user event to complete
441-
event->wait();
442-
443-
// this function does not need the event to be kept alive
444-
// anymore
445-
cuda_piEventRelease(event);
446-
};
447-
448-
// retain event to ensure it is still alive when the
449-
// user_wait_func callback is invoked
450-
cuda_piEventRetain(event);
451-
452-
return PI_CHECK_ERROR(cuLaunchHostFunc(queue->get(), user_wait_func, event));
453-
}
387+
// for native events, the cuStreamWaitEvent call is used.
388+
// This makes all future work submitted to stream wait for all
389+
// work captured in event.
390+
return PI_CHECK_ERROR(cuStreamWaitEvent(queue->get(), event->get(), 0));
454391
}
455392

456393
_pi_program::_pi_program(pi_context ctxt)
@@ -2763,37 +2700,13 @@ pi_result cuda_piEventSetCallback(pi_event event,
27632700
pi_int32 command_exec_callback_type,
27642701
pfn_notify notify, void *user_data) {
27652702

2766-
assert(event);
2767-
assert(notify);
2768-
assert(command_exec_callback_type == PI_EVENT_SUBMITTED ||
2769-
command_exec_callback_type == PI_EVENT_RUNNING ||
2770-
command_exec_callback_type == PI_EVENT_COMPLETE);
2771-
event_callback callback(pi_event_status(command_exec_callback_type), notify,
2772-
user_data);
2773-
2774-
event->set_event_callback(callback);
2775-
2703+
cl::sycl::detail::pi::die("Event Callback not implemented");
27762704
return PI_SUCCESS;
27772705
}
27782706

27792707
pi_result cuda_piEventSetStatus(pi_event event, pi_int32 execution_status) {
27802708

2781-
assert(execution_status >= PI_EVENT_COMPLETE &&
2782-
execution_status <= PI_EVENT_QUEUED);
2783-
2784-
if (!event || event->is_native_event()) {
2785-
return PI_INVALID_EVENT;
2786-
}
2787-
2788-
if (execution_status == PI_EVENT_COMPLETE) {
2789-
return event->set_event_complete();
2790-
} else if (execution_status < 0) {
2791-
// TODO: A negative integer value causes all enqueued commands that wait
2792-
// on this user event to be terminated.
2793-
cl::sycl::detail::pi::die("cuda_piEventSetStatus support for negative execution_status not "
2794-
"implemented.");
2795-
}
2796-
2709+
cl::sycl::detail::pi::die("Event Set Status not implemented");
27972710
return PI_INVALID_VALUE;
27982711
}
27992712

@@ -2821,19 +2734,13 @@ pi_result cuda_piEventRelease(pi_event event) {
28212734
if (event->decrement_reference_count() == 0) {
28222735
std::unique_ptr<_pi_event> event_ptr{event};
28232736
pi_result result = PI_INVALID_EVENT;
2824-
2825-
if (event->is_native_event()) {
2826-
try {
2827-
ScopedContext active(event->get_context());
2828-
auto cuEvent = event->get();
2829-
result = PI_CHECK_ERROR(cuEventDestroy(cuEvent));
2830-
} catch (...) {
2831-
result = PI_OUT_OF_RESOURCES;
2832-
}
2833-
} else {
2834-
result = PI_SUCCESS;
2737+
try {
2738+
ScopedContext active(event->get_context());
2739+
auto cuEvent = event->get();
2740+
result = PI_CHECK_ERROR(cuEventDestroy(cuEvent));
2741+
} catch (...) {
2742+
result = PI_OUT_OF_RESOURCES;
28352743
}
2836-
28372744
return result;
28382745
}
28392746

@@ -2888,9 +2795,6 @@ pi_result cuda_piEnqueueEventsWait(pi_queue command_queue,
28882795
/// \return PI_SUCCESS on success. PI_INVALID_EVENT if given a user event.
28892796
pi_result cuda_piextEventGetNativeHandle(pi_event event,
28902797
pi_native_handle *nativeHandle) {
2891-
if (event->is_user_event()) {
2892-
return PI_INVALID_EVENT;
2893-
}
28942798
*nativeHandle = reinterpret_cast<pi_native_handle>(event->get());
28952799
return PI_SUCCESS;
28962800
}

sycl/plugins/cuda/pi_cuda.hpp

Lines changed: 7 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -302,37 +302,6 @@ struct _pi_queue {
302302

303303
typedef void (*pfn_notify)(pi_event event, pi_int32 eventCommandStatus,
304304
void *userData);
305-
306-
class event_callback {
307-
public:
308-
void trigger_callback(pi_event event, pi_int32 currentEventStatus) const {
309-
310-
auto validParameters = callback_ && event;
311-
312-
// As a pi_event_status value approaches 0, it gets closer to completion.
313-
// If the calling pi_event's status is less than or equal to the event
314-
// status the user is interested in, invoke the callback anyway. The event
315-
// will have passed through that state anyway.
316-
auto validStatus = currentEventStatus <= observedEventStatus_;
317-
318-
if (validParameters && validStatus) {
319-
320-
callback_(event, currentEventStatus, userData_);
321-
}
322-
}
323-
324-
event_callback(pi_event_status status, pfn_notify callback, void *userData)
325-
: observedEventStatus_{status}, callback_{callback}, userData_{userData} {
326-
}
327-
328-
pi_event_status get_status() const noexcept { return observedEventStatus_; }
329-
330-
private:
331-
pi_event_status observedEventStatus_;
332-
pfn_notify callback_;
333-
void *userData_;
334-
};
335-
336305
/// PI Event mapping to CUevent
337306
///
338307
class _pi_event {
@@ -347,41 +316,6 @@ class _pi_event {
347316

348317
native_type get() const noexcept { return evEnd_; };
349318

350-
pi_result set_event_complete() noexcept {
351-
352-
if (isCompleted_) {
353-
return PI_INVALID_OPERATION;
354-
}
355-
356-
isRecorded_ = true;
357-
isCompleted_ = true;
358-
359-
trigger_callback(get_execution_status());
360-
361-
return PI_SUCCESS;
362-
}
363-
364-
void trigger_callback(pi_int32 status) {
365-
366-
std::vector<event_callback> callbacks;
367-
368-
// Here we move all callbacks into local variable before we call them.
369-
// This is a defensive maneuver; if any of the callbacks attempt to
370-
// add additional callbacks, we will end up in a bad spot. Our mutex
371-
// will be locked twice and the vector will be modified as it is being
372-
// iterated over! By moving everything locally, we can call all of these
373-
// callbacks and let them modify the original vector without much worry.
374-
375-
{
376-
std::lock_guard<std::mutex> lock(mutex_);
377-
event_callbacks_.swap(callbacks);
378-
}
379-
380-
for (auto &event_callback : callbacks) {
381-
event_callback.trigger_callback(this, status);
382-
}
383-
}
384-
385319
pi_queue get_queue() const noexcept { return queue_; }
386320

387321
pi_command_type get_command_type() const noexcept { return commandType_; }
@@ -390,10 +324,10 @@ class _pi_event {
390324

391325
bool is_recorded() const noexcept { return isRecorded_; }
392326

393-
bool is_completed() const noexcept { return isCompleted_; }
394-
395327
bool is_started() const noexcept { return isStarted_; }
396328

329+
bool is_completed() const noexcept { return isCompleted_; };
330+
397331
pi_int32 get_execution_status() const noexcept {
398332

399333
if (!is_recorded()) {
@@ -406,24 +340,8 @@ class _pi_event {
406340
return PI_EVENT_COMPLETE;
407341
}
408342

409-
void set_event_callback(const event_callback &callback) {
410-
auto current_status = get_execution_status();
411-
if (current_status <= callback.get_status()) {
412-
callback.trigger_callback(this, current_status);
413-
} else {
414-
std::lock_guard<std::mutex> lock(mutex_);
415-
event_callbacks_.emplace_back(callback);
416-
}
417-
}
418-
419343
pi_context get_context() const noexcept { return context_; };
420344

421-
bool is_user_event() const noexcept {
422-
return get_command_type() == PI_COMMAND_TYPE_USER;
423-
}
424-
425-
bool is_native_event() const noexcept { return !is_user_event(); }
426-
427345
pi_uint32 increment_reference_count() { return ++refCount_; }
428346

429347
pi_uint32 decrement_reference_count() { return --refCount_; }
@@ -462,13 +380,14 @@ class _pi_event {
462380

463381
std::atomic_uint32_t refCount_; // Event reference count.
464382

465-
std::atomic_bool isCompleted_; // Atomic bool used by user events. Can be
466-
// used to wait for a user event's completion.
383+
bool isCompleted_; // Signifies whether the operations have completed
384+
//
467385

468386
bool isRecorded_; // Signifies wether a native CUDA event has been recorded
469387
// yet.
470-
bool isStarted_; // Signifies wether the operation associated with the
471-
// PI event has started or not
388+
bool isStarted_; // Signifies wether the operation associated with the
389+
// PI event has started or not
390+
//
472391

473392
native_type evEnd_; // CUDA event handle. If this _pi_event represents a user
474393
// event, this will be nullptr.
@@ -484,12 +403,6 @@ class _pi_event {
484403
pi_context context_; // pi_context associated with the event. If this is a
485404
// native event, this will be the same context associated
486405
// with the queue_ member.
487-
488-
std::mutex mutex_; // Protect access to event_callbacks_. TODO: There might be
489-
// a lock-free data structure we can use here.
490-
std::vector<event_callback>
491-
event_callbacks_; // Callbacks that can be triggered when an event's state
492-
// changes.
493406
};
494407

495408
/// Implementation of PI Program on CUDA Module object

0 commit comments

Comments
 (0)