Skip to content

[SYCL][CUDA] Add experimental cuda interop with queue #6290

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jun 15, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ template <> struct InteropFeatureSupportMap<backend::ext_oneapi_cuda> {
static constexpr bool MakePlatform = false;
static constexpr bool MakeDevice = true;
static constexpr bool MakeContext = true;
static constexpr bool MakeQueue = false;
static constexpr bool MakeQueue = true;
static constexpr bool MakeEvent = false;
static constexpr bool MakeBuffer = false;
static constexpr bool MakeKernel = false;
Expand Down
10 changes: 10 additions & 0 deletions sycl/include/sycl/ext/oneapi/experimental/backend/cuda.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,5 +71,15 @@ inline device make_device<backend::ext_oneapi_cuda>(
return ext::oneapi::cuda::make_device(NativeHandle);
}

// CUDA queue specialization
template <>
inline queue make_queue<backend::ext_oneapi_cuda>(
const backend_input_t<backend::ext_oneapi_cuda, queue> &BackendObject,
const context &TargetContext, const async_handler Handler) {
return detail::make_queue(detail::pi::cast<pi_native_handle>(BackendObject),
TargetContext, true, Handler,
/*Backend*/ backend::ext_oneapi_cuda);
}

} // namespace sycl
} // __SYCL_INLINE_NAMESPACE(cl)
48 changes: 40 additions & 8 deletions sycl/plugins/cuda/pi_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2437,6 +2437,9 @@ pi_result cuda_piQueueRelease(pi_queue command_queue) {
try {
std::unique_ptr<_pi_queue> queueImpl(command_queue);

if (!command_queue->backend_has_ownership())
return PI_SUCCESS;

ScopedContext active(command_queue->get_context());

command_queue->for_each_stream([](CUstream s) {
Expand Down Expand Up @@ -2500,8 +2503,7 @@ pi_result cuda_piextQueueGetNativeHandle(pi_queue queue,
}

/// Created a PI queue object from a CUDA queue handle.
/// TODO: Implement this.
/// NOTE: The created PI object takes ownership of the native handle.
/// NOTE: The created PI object does not take ownership of the native handle.
///
/// \param[in] nativeHandle The native handle to create PI queue object from.
/// \param[in] context is the PI context of the queue.
Expand All @@ -2510,13 +2512,43 @@ pi_result cuda_piextQueueGetNativeHandle(pi_queue queue,
/// the native handle, if it can.
///
/// \return TBD
pi_result cuda_piextQueueCreateWithNativeHandle(pi_native_handle, pi_context,
pi_device, bool ownNativeHandle,
pi_queue *) {
pi_result cuda_piextQueueCreateWithNativeHandle(pi_native_handle nativeHandle,
pi_context context,
pi_device device,
bool ownNativeHandle,
pi_queue *queue) {
(void)device;
(void)ownNativeHandle;
cl::sycl::detail::pi::die(
"Creation of PI queue from native handle not implemented");
return {};
assert(ownNativeHandle == false);

unsigned int flags;
CUstream cuStream = reinterpret_cast<CUstream>(nativeHandle);

auto retErr = PI_CHECK_ERROR(cuStreamGetFlags(cuStream, &flags));

pi_queue_properties properties = 0;
if (flags == CU_STREAM_DEFAULT)
properties = __SYCL_PI_CUDA_USE_DEFAULT_STREAM;
else if (flags == CU_STREAM_NON_BLOCKING)
properties = __SYCL_PI_CUDA_SYNC_WITH_DEFAULT;
else
cl::sycl::detail::pi::die("Unknown cuda stream");

std::vector<CUstream> computeCuStreams(1, cuStream);
std::vector<CUstream> transferCuStreams(0);

// Create queue and set num_compute_streams to 1, as computeCuStreams has
// valid stream
*queue = new _pi_queue{std::move(computeCuStreams),
std::move(transferCuStreams),
context,
context->get_device(),
properties,
flags,
/*backend_owns*/ false};
(*queue)->num_compute_streams_ = 1;

return retErr;
}

pi_result cuda_piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer,
Expand Down
7 changes: 5 additions & 2 deletions sycl/plugins/cuda/pi_cuda.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -401,18 +401,19 @@ struct _pi_queue {
unsigned int flags_;
std::mutex compute_stream_mutex_;
std::mutex transfer_stream_mutex_;
bool has_ownership_;

_pi_queue(std::vector<CUstream> &&compute_streams,
std::vector<CUstream> &&transfer_streams, _pi_context *context,
_pi_device *device, pi_queue_properties properties,
unsigned int flags)
unsigned int flags, bool backend_owns = true)
: compute_streams_{std::move(compute_streams)},
transfer_streams_{std::move(transfer_streams)}, context_{context},
device_{device}, properties_{properties}, refCount_{1}, eventCount_{0},
compute_stream_idx_{0}, transfer_stream_idx_{0},
num_compute_streams_{0}, num_transfer_streams_{0},
last_sync_compute_streams_{0}, last_sync_transfer_streams_{0},
flags_(flags) {
flags_(flags), has_ownership_{backend_owns} {
cuda_piContextRetain(context_);
cuda_piDeviceRetain(device_);
}
Expand Down Expand Up @@ -513,6 +514,8 @@ struct _pi_queue {
pi_uint32 get_reference_count() const noexcept { return refCount_; }

pi_uint32 get_next_event_id() noexcept { return ++eventCount_; }

bool backend_has_ownership() const noexcept { return has_ownership_; }
};

typedef void (*pfn_notify)(pi_event event, pi_int32 eventCommandStatus,
Expand Down
7 changes: 7 additions & 0 deletions sycl/test/basic_tests/interop-cuda-experimental.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ int main() {

backend_traits<backend::ext_oneapi_cuda>::return_type<device> cu_device;
backend_traits<backend::ext_oneapi_cuda>::return_type<context> cu_context;
backend_traits<backend::ext_oneapi_cuda>::return_type<queue> cu_queue;

// 4.5.1.2 For each SYCL runtime class T which supports SYCL application
// interoperability, a specialization of get_native must be defined, which
Expand All @@ -50,6 +51,7 @@ int main() {

cu_device = get_native<backend::ext_oneapi_cuda>(Device);
cu_context = get_native<backend::ext_oneapi_cuda>(Context);
cu_queue = get_native<backend::ext_oneapi_cuda>(Queue);

// Check deprecated
// expected-warning@+2 {{'get_native' is deprecated: Use SYCL 2020 sycl::get_native free function}}
Expand All @@ -58,6 +60,9 @@ int main() {
// expected-warning@+2 {{'get_native' is deprecated: Use SYCL 2020 sycl::get_native free function}}
// expected-warning@+1 {{'get_native<sycl::backend::ext_oneapi_cuda>' is deprecated: Use SYCL 2020 sycl::get_native free function}}
cu_context = Context.get_native<backend::ext_oneapi_cuda>();
// expected-warning@+2 {{'get_native' is deprecated: Use SYCL 2020 sycl::get_native free function}}
// expected-warning@+1 {{'get_native<sycl::backend::ext_oneapi_cuda>' is deprecated: Use SYCL 2020 sycl::get_native free function}}
cu_queue = Queue.get_native<backend::ext_oneapi_cuda>();

// 4.5.1.1 For each SYCL runtime class T which supports SYCL application
// interoperability with the SYCL backend, a specialization of input_type must
Expand Down Expand Up @@ -85,5 +90,7 @@ int main() {
context InteropContext =
make_context<backend::ext_oneapi_cuda>(InteropContextInput);

queue InteropQueue = make_queue<backend::ext_oneapi_cuda>(cu_queue, Context);

return 0;
}