Skip to content

Commit 5baf374

Browse files
author
Steffen Larsen
committed
[SYCL][CUDA] Event callback worker implementation
Introduces a worker thread living in the platform for handling callbacks for native events. This is needed because host functions enqueued by cuLaunchHostFunc are strictly prohibited from doing any CUDA operations; however, since a SYCL object, such as the event itself, may end its life in the callback, we must be able to destroy the CUDA object. Signed-off-by: Steffen Larsen <[email protected]>
1 parent dbf1c80 commit 5baf374

File tree

4 files changed

+79
-5
lines changed

4 files changed

+79
-5
lines changed

sycl/plugins/cuda/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ message(STATUS "Including the PI API CUDA backend.")
44
# we only require the CUDA driver API to be used
55
# CUDA_CUDA_LIBRARY variable defines the path to libcuda.so, the CUDA Driver API library.
66

7+
find_package(Threads REQUIRED)
78
find_package(CUDA 10.0 REQUIRED)
89

910
add_library(cudadrv SHARED IMPORTED)
@@ -33,7 +34,7 @@ target_include_directories(pi_cuda PRIVATE "${sycl_inc_dir}")
3334

3435
target_include_directories(pi_cuda INTERFACE ${CUDA_INCLUDE_DIRS})
3536

36-
target_link_libraries(pi_cuda PUBLIC OpenCL-Headers cudadrv)
37+
target_link_libraries(pi_cuda PUBLIC OpenCL-Headers cudadrv Threads::Threads)
3738

3839
target_link_libraries(sycl INTERFACE pi_cuda)
3940

sycl/plugins/cuda/pi_cuda.cpp

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,31 @@ pi_result cuda_piEventRetain(pi_event event);
9797

9898
} // extern "C"
9999

100+
void worker::execute() {
101+
bool Terminate = false;
102+
while (!Terminate) {
103+
std::unique_lock<std::mutex> lock(workQueueGateMutex_);
104+
workQueueGate_.wait(lock);
105+
while (!workQueue_.empty()) {
106+
work item = workQueue_.front();
107+
workQueue_.pop();
108+
switch (item.kind_) {
109+
case work::kind::complete_event:
110+
complete_event(static_cast<pi_event>(item.content_));
111+
break;
112+
case work::kind::terminate:
113+
Terminate = true;
114+
break;
115+
}
116+
}
117+
}
118+
}
119+
120+
/// Finishes the host-side handling of \p event on the worker thread.
///
/// \param event the event whose completion is being reported.
void worker::complete_event(pi_event event) {
  // Flag the event as complete first ...
  event->set_event_complete();
  // ... and only then release it: the event must not be touched after this
  // call (presumably this may drop the last reference and destroy the event;
  // confirm against cuda_piEventRelease).
  cuda_piEventRelease(event);
}
124+
100125
_pi_event::_pi_event(pi_command_type type, pi_context context, pi_queue queue)
101126
: commandType_{type}, refCount_{1}, isCompleted_{false},
102127
isRecorded_{false},
@@ -174,8 +199,9 @@ pi_result _pi_event::record() {
174199
cuStream,
175200
[](void *userData) {
176201
pi_event event = reinterpret_cast<pi_event>(userData);
177-
event->set_event_complete();
178-
cuda_piEventRelease(event);
202+
pi_platform platform =
203+
event->get_context()->get_device()->platform_;
204+
platform->worker_.enqueue_complete_event(event);
179205
},
180206
this));
181207
} catch (...) {

sycl/plugins/cuda/pi_cuda.hpp

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,15 +17,18 @@
1717
#include <array>
1818
#include <atomic>
1919
#include <cassert>
20+
#include <condition_variable>
2021
#include <cstring>
2122
#include <cuda.h>
23+
#include <functional>
2224
#include <limits>
25+
#include <mutex>
2326
#include <numeric>
27+
#include <queue>
2428
#include <stdint.h>
2529
#include <string>
30+
#include <thread>
2631
#include <vector>
27-
#include <functional>
28-
#include <mutex>
2932

3033
extern "C" {
3134

@@ -45,8 +48,48 @@ pi_result cuda_piKernelRelease(pi_kernel);
4548

4649
}
4750

51+
class worker {
52+
public:
53+
worker()
54+
: workQueue_{}, workQueueGateMutex_{}, workQueueGate_{},
55+
workerThread_{&worker::execute, this} {}
56+
57+
~worker() {
58+
enqueue_work(work{work::kind::terminate, nullptr});
59+
workerThread_.join();
60+
}
61+
62+
void enqueue_complete_event(pi_event event) {
63+
enqueue_work(work{work::kind::complete_event, event});
64+
}
65+
66+
private:
67+
struct work {
68+
enum kind { complete_event, terminate } kind_;
69+
void *content_;
70+
71+
work(kind k, void *c) : kind_{k}, content_{c} {}
72+
};
73+
74+
void enqueue_work(work w) {
75+
std::unique_lock<std::mutex> lock(workQueueGateMutex_);
76+
workQueue_.push(w);
77+
workQueueGate_.notify_one();
78+
}
79+
80+
void complete_event(pi_event event);
81+
82+
void execute();
83+
84+
std::queue<work> workQueue_;
85+
std::mutex workQueueGateMutex_;
86+
std::condition_variable workQueueGate_;
87+
std::thread workerThread_;
88+
};
89+
4890
struct _pi_platform {
4991
std::vector<std::unique_ptr<_pi_device>> devices_;
92+
worker worker_;
5093
};
5194

5295
struct _pi_device {

sycl/source/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,16 @@ function(add_sycl_rt_library LIB_NAME)
2727
set_target_properties(${LIB_NAME} PROPERTIES LINK_DEPENDS ${linker_script})
2828
endif()
2929

30+
find_package(Threads REQUIRED)
31+
set_property(TARGET ${LIB_NAME} PROPERTY LINK_WHAT_YOU_USE TRUE)
32+
3033
target_include_directories(
3134
${LIB_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} "${sycl_inc_dir}")
3235
target_link_libraries(${LIB_NAME}
3336
PRIVATE OpenCL::Headers
3437
PRIVATE ${OpenCL_LIBRARIES}
3538
PRIVATE ${CMAKE_DL_LIBS}
39+
PRIVATE Threads::Threads
3640
)
3741

3842
add_common_options(${LIB_NAME})

0 commit comments

Comments
 (0)