Skip to content

Commit d7ee359

Browse files
[SYCL][CUDA] Event synchronization only done for latest events (#1995)
CUDA streams operate in-order, so when waiting for a list of events we introduce unnecessary overhead. These changes makes the PI CUDA backend only wait for the latest event for each stream in a given list of events. Signed-off-by: Steffen Larsen <[email protected]>
1 parent 8c8137f commit d7ee359

File tree

2 files changed

+50
-23
lines changed

2 files changed

+50
-23
lines changed

sycl/plugins/cuda/pi_cuda.cpp

Lines changed: 42 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -64,22 +64,41 @@ inline void assign_result(pi_result *ptr, pi_result value) noexcept {
6464
}
6565

6666
// Iterates over the event wait list, returns correct pi_result error codes.
67-
// Invokes the callback for each event in the wait list. The callback must take
68-
// a single pi_event argument and return a pi_result.
67+
// Invokes the callback for the latest event of each queue in the wait list.
68+
// The callback must take a single pi_event argument and return a pi_result.
6969
template <typename Func>
70-
pi_result forEachEvent(const pi_event *event_wait_list,
71-
std::size_t num_events_in_wait_list, Func &&f) {
70+
pi_result forLatestEvents(const pi_event *event_wait_list,
71+
std::size_t num_events_in_wait_list, Func &&f) {
7272

7373
if (event_wait_list == nullptr || num_events_in_wait_list == 0) {
7474
return PI_INVALID_EVENT_WAIT_LIST;
7575
}
7676

77-
for (size_t i = 0; i < num_events_in_wait_list; i++) {
78-
auto event = event_wait_list[i];
79-
if (event == nullptr) {
80-
return PI_INVALID_EVENT_WAIT_LIST;
77+
// Fast path if we only have a single event
78+
if (num_events_in_wait_list == 1) {
79+
return f(event_wait_list[0]);
80+
}
81+
82+
std::vector<pi_event> events{event_wait_list,
83+
event_wait_list + num_events_in_wait_list};
84+
std::sort(events.begin(), events.end(), [](pi_event e0, pi_event e1) {
85+
// Tiered sort creating sublists of streams (smallest value first) in which
86+
// the corresponding events are sorted into a sequence of newest first.
87+
return e0->get_queue()->stream_ < e1->get_queue()->stream_ ||
88+
(e0->get_queue()->stream_ == e1->get_queue()->stream_ &&
89+
e0->get_event_id() > e1->get_event_id());
90+
});
91+
92+
bool first = true;
93+
CUstream lastSeenStream = 0;
94+
for (pi_event event : events) {
95+
if (!event || (!first && event->get_queue()->stream_ == lastSeenStream)) {
96+
continue;
8197
}
8298

99+
first = false;
100+
lastSeenStream = event->get_queue()->stream_;
101+
83102
auto result = f(event);
84103
if (result != PI_SUCCESS) {
85104
return result;
@@ -357,6 +376,11 @@ pi_result _pi_event::record() {
357376
CUstream cuStream = queue_->get();
358377

359378
try {
379+
eventId_ = queue_->get_next_event_id();
380+
if (eventId_ == 0) {
381+
cl::sycl::detail::pi::die(
382+
"Unrecoverable program state reached in event identifier overflow");
383+
}
360384
result = PI_CHECK_ERROR(cuEventRecord(evEnd_, cuStream));
361385
} catch (pi_result error) {
362386
result = error;
@@ -1961,8 +1985,8 @@ pi_result cuda_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer,
19611985
pi_result cuda_piEventsWait(pi_uint32 num_events, const pi_event *event_list) {
19621986

19631987
try {
1964-
pi_result err = PI_SUCCESS;
1965-
1988+
assert(num_events != 0);
1989+
assert(event_list);
19661990
if (num_events == 0) {
19671991
return PI_INVALID_VALUE;
19681992
}
@@ -1974,11 +1998,7 @@ pi_result cuda_piEventsWait(pi_uint32 num_events, const pi_event *event_list) {
19741998
auto context = event_list[0]->get_context();
19751999
ScopedContext active(context);
19762000

1977-
for (pi_uint32 count = 0; count < num_events && (err == PI_SUCCESS);
1978-
count++) {
1979-
1980-
auto event = event_list[count];
1981-
2001+
auto waitFunc = [context](pi_event event) -> pi_result {
19822002
if (!event) {
19832003
return PI_INVALID_EVENT;
19842004
}
@@ -1987,9 +2007,9 @@ pi_result cuda_piEventsWait(pi_uint32 num_events, const pi_event *event_list) {
19872007
return PI_INVALID_CONTEXT;
19882008
}
19892009

1990-
err = event->wait();
1991-
}
1992-
return err;
2010+
return event->wait();
2011+
};
2012+
return forLatestEvents(event_list, num_events, waitFunc);
19932013
} catch (pi_result err) {
19942014
return err;
19952015
} catch (...) {
@@ -2763,10 +2783,10 @@ pi_result cuda_piEnqueueEventsWait(pi_queue command_queue,
27632783

27642784
if (event_wait_list) {
27652785
auto result =
2766-
forEachEvent(event_wait_list, num_events_in_wait_list,
2767-
[command_queue](pi_event event) -> pi_result {
2768-
return enqueueEventWait(command_queue, event);
2769-
});
2786+
forLatestEvents(event_wait_list, num_events_in_wait_list,
2787+
[command_queue](pi_event event) -> pi_result {
2788+
return enqueueEventWait(command_queue, event);
2789+
});
27702790

27712791
if (result != PI_SUCCESS) {
27722792
return result;

sycl/plugins/cuda/pi_cuda.hpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -281,11 +281,12 @@ struct _pi_queue {
281281
_pi_device *device_;
282282
pi_queue_properties properties_;
283283
std::atomic_uint32_t refCount_;
284+
std::atomic_uint32_t eventCount_;
284285

285286
_pi_queue(CUstream stream, _pi_context *context, _pi_device *device,
286287
pi_queue_properties properties)
287288
: stream_{stream}, context_{context}, device_{device},
288-
properties_{properties}, refCount_{1} {
289+
properties_{properties}, refCount_{1}, eventCount_{0} {
289290
cuda_piContextRetain(context_);
290291
cuda_piDeviceRetain(device_);
291292
}
@@ -304,6 +305,8 @@ struct _pi_queue {
304305
pi_uint32 decrement_reference_count() noexcept { return --refCount_; }
305306

306307
pi_uint32 get_reference_count() const noexcept { return refCount_; }
308+
309+
pi_uint32 get_next_event_id() noexcept { return ++eventCount_; }
307310
};
308311

309312
typedef void (*pfn_notify)(pi_event event, pi_int32 eventCommandStatus,
@@ -352,6 +355,8 @@ class _pi_event {
352355

353356
pi_uint32 decrement_reference_count() { return --refCount_; }
354357

358+
pi_uint32 get_event_id() const noexcept { return eventId_; }
359+
355360
// Returns the counter time when the associated command(s) were enqueued
356361
//
357362
pi_uint64 get_queued_time() const;
@@ -389,6 +394,8 @@ class _pi_event {
389394
// PI event has started or not
390395
//
391396

397+
pi_uint32 eventId_; // Queue identifier of the event.
398+
392399
native_type evEnd_; // CUDA event handle. If this _pi_event represents a user
393400
// event, this will be nullptr.
394401

0 commit comments

Comments
 (0)