
Commit 24b7bc3

[UR][CUDA][HIP] Unify queue handling between adapters (#17641)
The CUDA and HIP adapters both use a nearly identical, complicated queue that builds an out-of-order UR queue on top of in-order CUDA/HIP streams. This patch extracts all of the queue logic into a separate templated class that both adapters can share. Beyond removing a lot of duplicated code, this makes the queue much easier to maintain.

There were a few functional differences between the queues in the two adapters, mostly due to fixes made in the CUDA adapter that were never ported to HIP. There may be more, but at least one race condition (#15100) and one performance issue (#6333) were not fixed in the HIP adapter. This patch uses the CUDA version of the queue as the base for the generic queue, and therefore also fixes that race condition and performance issue for HIP.

This code is quite complex, so the patch also aims to minimize any changes beyond the structural ones needed to share the code. It does, however, make the following changes in the two adapters:

`stream_queue.hpp`:
* Remove `urDeviceRetain/Release`: essentially a no-op

CUDA:
* Rename `ur_stream_guard_` to `ur_stream_guard`
* Rename `getNextEventID` to `getNextEventId`
* Remove the duplicate `get_device` getter; use `getDevice` instead

HIP:
* Fix queue finish so it doesn't fail when no streams need to be synchronized
1 parent c73f76f commit 24b7bc3
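The sharing pattern the commit message describes can be pictured with a small sketch. This is illustrative only: the class name `stream_queue_base`, the `createStream` hook, and the member names are assumptions for exposition, not identifiers taken from `stream_queue.hpp`. It shows how one template can own the stream pool and round-robin bookkeeping while each adapter supplies its native stream type and backend stream creation.

```cpp
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <mutex>
#include <vector>

// Hypothetical shared base: NativeStream would be CUstream for CUDA or
// hipStream_t for HIP; Derived is the adapter's queue class (CRTP) that
// provides createStream.
template <typename NativeStream, typename Derived> class stream_queue_base {
protected:
  std::vector<NativeStream> ComputeStreams; // lazily created in-order streams
  std::mutex ComputeStreamMutex;            // guards lazy stream creation
  std::atomic<uint32_t> ComputeStreamIndex{0};
  uint32_t NumComputeStreams = 0;

public:
  explicit stream_queue_base(std::size_t PoolSize) : ComputeStreams(PoolSize) {}

  // Round-robin selection shared by both adapters; only the backend call that
  // creates a stream differs, so it is delegated to the derived class.
  NativeStream getNextComputeStream() {
    if (NumComputeStreams < ComputeStreams.size()) {
      std::lock_guard<std::mutex> Lock(ComputeStreamMutex);
      // Re-check under the lock so two threads don't initialise the same slot.
      if (NumComputeStreams < ComputeStreams.size())
        static_cast<Derived *>(this)->createStream(
            ComputeStreams[NumComputeStreams++]);
    }
    uint32_t Token = ComputeStreamIndex++;
    return ComputeStreams[Token % ComputeStreams.size()];
  }
};

// A CUDA-flavoured adapter might then look roughly like:
//   struct cuda_queue : stream_queue_base<CUstream, cuda_queue> {
//     void createStream(CUstream &S) { /* cuStreamCreateWithPriority(...) */ }
//   };
```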

File tree

10 files changed: +401 −648 lines changed

.github/CODEOWNERS

Lines changed: 1 addition & 0 deletions

@@ -75,6 +75,7 @@ llvm/include/llvm/SYCLLowerIR/SYCLCreateNVVMAnnotations.h @intel/llvm-reviewers-
 llvm/lib/SYCLLowerIR/SYCLCreateNVVMAnnotations.cpp @intel/llvm-reviewers-cuda
 llvm/lib/Target/NVPTX @intel/llvm-reviewers-cuda
 llvm/lib/Target/AMDGPU @intel/llvm-reviewers-cuda
+unified-runtime/source/common/cuda-hip @intel/llvm-reviewers-cuda
 
 # XPTI instrumentation utilities
 xpti/ @intel/llvm-reviewers-runtime

unified-runtime/source/adapters/cuda/async_alloc.cpp

Lines changed: 2 additions & 2 deletions

@@ -25,7 +25,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMDeviceAllocExp(
 
   ScopedContext Active(hQueue->getDevice());
   uint32_t StreamToken;
-  ur_stream_guard_ Guard;
+  ur_stream_guard Guard;
   CUstream CuStream = hQueue->getNextComputeStream(
       numEventsInWaitList, phEventWaitList, Guard, &StreamToken);
 
@@ -83,7 +83,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFreeExp(
 
   ScopedContext Active(hQueue->getDevice());
   uint32_t StreamToken;
-  ur_stream_guard_ Guard;
+  ur_stream_guard Guard;
   CUstream CuStream = hQueue->getNextComputeStream(
       numEventsInWaitList, phEventWaitList, Guard, &StreamToken);

unified-runtime/source/adapters/cuda/command_buffer.cpp

Lines changed: 1 addition & 1 deletion

@@ -1133,7 +1133,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCommandBufferExp(
   std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
   ScopedContext Active(hQueue->getDevice());
   uint32_t StreamToken;
-  ur_stream_guard_ Guard;
+  ur_stream_guard Guard;
   CUstream CuStream = hQueue->getNextComputeStream(
       numEventsInWaitList, phEventWaitList, Guard, &StreamToken);

unified-runtime/source/adapters/cuda/enqueue.cpp

Lines changed: 4 additions & 4 deletions

@@ -315,7 +315,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
   try {
     ScopedContext Active(hQueue->getDevice());
     uint32_t StreamToken;
-    ur_stream_guard_ Guard;
+    ur_stream_guard Guard;
     CUstream CuStream = hQueue->getNextComputeStream(
         numEventsInWaitList, phEventWaitList, Guard, &StreamToken);
     {
@@ -440,7 +440,7 @@ enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel,
 
   ScopedContext Active(hQueue->getDevice());
   uint32_t StreamToken;
-  ur_stream_guard_ Guard;
+  ur_stream_guard Guard;
   CUstream CuStream = hQueue->getNextComputeStream(
       numEventsInWaitList, phEventWaitList, Guard, &StreamToken);
 
@@ -628,7 +628,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
 
   ScopedContext Active(hQueue->getDevice());
   uint32_t StreamToken;
-  ur_stream_guard_ Guard;
+  ur_stream_guard Guard;
   CUstream CuStream = hQueue->getNextComputeStream(
       numEventsInWaitList, phEventWaitList, Guard, &StreamToken);
 
@@ -1517,7 +1517,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill(
   try {
     ScopedContext Active(hQueue->getDevice());
     uint32_t StreamToken;
-    ur_stream_guard_ Guard;
+    ur_stream_guard Guard;
     CUstream CuStream = hQueue->getNextComputeStream(
         numEventsInWaitList, phEventWaitList, Guard, &StreamToken);
     UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList,

unified-runtime/source/adapters/cuda/event.cpp

Lines changed: 4 additions & 4 deletions

@@ -87,17 +87,17 @@ bool ur_event_handle_t_::isCompleted() const noexcept try {
 
 uint64_t ur_event_handle_t_::getQueuedTime() const {
   assert(isStarted());
-  return Queue->get_device()->getElapsedTime(EvQueued);
+  return Queue->getDevice()->getElapsedTime(EvQueued);
 }
 
 uint64_t ur_event_handle_t_::getStartTime() const {
   assert(isStarted());
-  return Queue->get_device()->getElapsedTime(EvStart);
+  return Queue->getDevice()->getElapsedTime(EvStart);
 }
 
 uint64_t ur_event_handle_t_::getEndTime() const {
   assert(isStarted() && isRecorded());
-  return Queue->get_device()->getElapsedTime(EvEnd);
+  return Queue->getDevice()->getElapsedTime(EvEnd);
 }
 
 ur_result_t ur_event_handle_t_::record() {
@@ -111,7 +111,7 @@ ur_result_t ur_event_handle_t_::record() {
   UR_ASSERT(Queue, UR_RESULT_ERROR_INVALID_QUEUE);
 
   try {
-    EventID = Queue->getNextEventID();
+    EventID = Queue->getNextEventId();
     if (EventID == 0) {
       die("Unrecoverable program state reached in event identifier overflow");
     }

unified-runtime/source/adapters/cuda/queue.cpp

Lines changed: 7 additions & 83 deletions

@@ -32,93 +32,17 @@ void ur_queue_handle_t_::transferStreamWaitForBarrierIfNeeded(
   }
 }
 
-CUstream ur_queue_handle_t_::getNextComputeStream(uint32_t *StreamToken) {
-  if (getThreadLocalStream() != CUstream{0})
-    return getThreadLocalStream();
-  uint32_t StreamI;
-  uint32_t Token;
-  while (true) {
-    if (NumComputeStreams < ComputeStreams.size()) {
-      // the check above is for performance - so as not to lock mutex every time
-      std::lock_guard<std::mutex> guard(ComputeStreamMutex);
-      // The second check is done after mutex is locked so other threads can not
-      // change NumComputeStreams after that
-      if (NumComputeStreams < ComputeStreams.size()) {
-        UR_CHECK_ERROR(cuStreamCreateWithPriority(
-            &ComputeStreams[NumComputeStreams], Flags, Priority));
-        ++NumComputeStreams;
-      }
-    }
-    Token = ComputeStreamIndex++;
-    StreamI = Token % ComputeStreams.size();
-    // if a stream has been reused before it was next selected round-robin
-    // fashion, we want to delay its next use and instead select another one
-    // that is more likely to have completed all the enqueued work.
-    if (DelayCompute[StreamI]) {
-      DelayCompute[StreamI] = false;
-    } else {
-      break;
-    }
-  }
-  if (StreamToken) {
-    *StreamToken = Token;
-  }
-  CUstream res = ComputeStreams[StreamI];
-  computeStreamWaitForBarrierIfNeeded(res, StreamI);
-  return res;
+ur_queue_handle_t ur_queue_handle_t_::getEventQueue(const ur_event_handle_t e) {
+  return e->getQueue();
 }
 
-CUstream ur_queue_handle_t_::getNextComputeStream(
-    uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList,
-    ur_stream_guard_ &Guard, uint32_t *StreamToken) {
-  if (getThreadLocalStream() != CUstream{0})
-    return getThreadLocalStream();
-  for (uint32_t i = 0; i < NumEventsInWaitList; i++) {
-    uint32_t Token = EventWaitList[i]->getComputeStreamToken();
-    if (reinterpret_cast<ur_queue_handle_t>(EventWaitList[i]->getQueue()) ==
-            this &&
-        canReuseStream(Token)) {
-      std::unique_lock<std::mutex> ComputeSyncGuard(ComputeStreamSyncMutex);
-      // redo the check after lock to avoid data races on
-      // LastSyncComputeStreams
-      if (canReuseStream(Token)) {
-        uint32_t StreamI = Token % DelayCompute.size();
-        DelayCompute[StreamI] = true;
-        if (StreamToken) {
-          *StreamToken = Token;
-        }
-        Guard = ur_stream_guard_{std::move(ComputeSyncGuard)};
-        CUstream Result = EventWaitList[i]->getStream();
-        computeStreamWaitForBarrierIfNeeded(Result, StreamI);
-        return Result;
-      }
-    }
-  }
-  Guard = {};
-  return getNextComputeStream(StreamToken);
+uint32_t
+ur_queue_handle_t_::getEventComputeStreamToken(const ur_event_handle_t e) {
+  return e->getComputeStreamToken();
 }
 
-CUstream ur_queue_handle_t_::getNextTransferStream() {
-  if (getThreadLocalStream() != CUstream{0})
-    return getThreadLocalStream();
-  if (TransferStreams.empty()) { // for example in in-order queue
-    return getNextComputeStream();
-  }
-  if (NumTransferStreams < TransferStreams.size()) {
-    // the check above is for performance - so as not to lock mutex every time
-    std::lock_guard<std::mutex> Guuard(TransferStreamMutex);
-    // The second check is done after mutex is locked so other threads can not
-    // change NumTransferStreams after that
-    if (NumTransferStreams < TransferStreams.size()) {
-      UR_CHECK_ERROR(cuStreamCreateWithPriority(
-          &TransferStreams[NumTransferStreams], Flags, Priority));
-      ++NumTransferStreams;
-    }
-  }
-  uint32_t StreamI = TransferStreamIndex++ % TransferStreams.size();
-  CUstream Result = TransferStreams[StreamI];
-  transferStreamWaitForBarrierIfNeeded(Result, StreamI);
-  return Result;
+CUstream ur_queue_handle_t_::getEventStream(const ur_event_handle_t e) {
+  return e->getStream();
 }
 
 /// Creates a `ur_queue_handle_t` object on the CUDA backend.
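The three one-line accessors added in this file (`getEventQueue`, `getEventComputeStreamToken`, `getEventStream`) are what allow the wait-list handling to move into the shared code: the generic queue can inspect an event without knowing the backend event type. Below is a minimal sketch of how such hooks might be consumed; apart from those three accessor names and `canReuseStream` (visible in the removed code above), every name here is an assumption for illustration.

```cpp
#include <cstdint>

// Hypothetical member of a shared queue template; EventHandle and the class
// shape are illustrative, not the real stream_queue.hpp interface.
template <typename NativeStream, typename EventHandle, typename Derived>
struct stream_queue_sketch {
  // If an event in the wait list was recorded on one of this queue's compute
  // streams and that stream can still be reused, enqueue onto the same stream
  // and skip cross-stream synchronisation; otherwise fall back to round-robin.
  NativeStream getNextComputeStream(uint32_t NumEventsInWaitList,
                                    const EventHandle *EventWaitList) {
    auto *Self = static_cast<Derived *>(this);
    for (uint32_t I = 0; I < NumEventsInWaitList; ++I) {
      uint32_t Token = Self->getEventComputeStreamToken(EventWaitList[I]);
      if (Self->getEventQueue(EventWaitList[I]) == Self &&
          Self->canReuseStream(Token))
        return Self->getEventStream(EventWaitList[I]);
    }
    return Self->getNextComputeStream(); // assumed round-robin overload
  }
};
```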
