intel · pvchupin · Jun 15, 2022 · Jun 10, 2022 · Jun 13, 2022 · Jun 13, 2022
@@ -131,7 +131,7 @@ template <> struct InteropFeatureSupportMap<backend::ext_oneapi_cuda> {
   static constexpr bool MakePlatform = false;
   static constexpr bool MakeDevice = true;
   static constexpr bool MakeContext = true;
-  static constexpr bool MakeQueue = false;
+  static constexpr bool MakeQueue = true;
   static constexpr bool MakeEvent = false;
   static constexpr bool MakeBuffer = false;
   static constexpr bool MakeKernel = false;

@@ -71,5 +71,15 @@ inline device make_device<backend::ext_oneapi_cuda>(
   return ext::oneapi::cuda::make_device(NativeHandle);
 }
 
+// CUDA queue specialization
+template <>
+inline queue make_queue<backend::ext_oneapi_cuda>(
+    const backend_input_t<backend::ext_oneapi_cuda, queue> &BackendObject,
+    const context &TargetContext, const async_handler Handler) {
+  return detail::make_queue(detail::pi::cast<pi_native_handle>(BackendObject),
+                            TargetContext, true, Handler,
+                            /*Backend*/ backend::ext_oneapi_cuda);
+}
+
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
@@ -2437,6 +2437,9 @@ pi_result cuda_piQueueRelease(pi_queue command_queue) {
   try {
     std::unique_ptr<_pi_queue> queueImpl(command_queue);
 
+    if (!command_queue->backend_has_ownership())
+      return PI_SUCCESS;
+
     ScopedContext active(command_queue->get_context());
 
     command_queue->for_each_stream([](CUstream s) {
@@ -2500,8 +2503,7 @@ pi_result cuda_piextQueueGetNativeHandle(pi_queue queue,
 }
 
 /// Created a PI queue object from a CUDA queue handle.
-/// TODO: Implement this.
-/// NOTE: The created PI object takes ownership of the native handle.
+/// NOTE: The created PI object does not take ownership of the native handle.
 ///
 /// \param[in] nativeHandle The native handle to create PI queue object from.
 /// \param[in] context is the PI context of the queue.
@@ -2510,13 +2512,43 @@ pi_result cuda_piextQueueGetNativeHandle(pi_queue queue,
 ///        the native handle, if it can.
 ///
 /// \return TBD
-pi_result cuda_piextQueueCreateWithNativeHandle(pi_native_handle, pi_context,
-                                                pi_device, bool ownNativeHandle,
-                                                pi_queue *) {
+pi_result cuda_piextQueueCreateWithNativeHandle(pi_native_handle nativeHandle,
+                                                pi_context context,
+                                                pi_device device,
+                                                bool ownNativeHandle,
+                                                pi_queue *queue) {
+  (void)device;
   (void)ownNativeHandle;
-  cl::sycl::detail::pi::die(
-      "Creation of PI queue from native handle not implemented");
-  return {};
+  assert(ownNativeHandle == false);
+
+  unsigned int flags;
+  CUstream cuStream = reinterpret_cast<CUstream>(nativeHandle);
+
+  auto retErr = PI_CHECK_ERROR(cuStreamGetFlags(cuStream, &flags));
+
+  pi_queue_properties properties = 0;
+  if (flags == CU_STREAM_DEFAULT)
+    properties = __SYCL_PI_CUDA_USE_DEFAULT_STREAM;
+  else if (flags == CU_STREAM_NON_BLOCKING)
+    properties = __SYCL_PI_CUDA_SYNC_WITH_DEFAULT;
+  else
+    cl::sycl::detail::pi::die("Unknown cuda stream");
+
+  std::vector<CUstream> computeCuStreams(1, cuStream);
+  std::vector<CUstream> transferCuStreams(0);
+
+  // Create queue and set num_compute_streams to 1, as computeCuStreams has
+  // valid stream
+  *queue = new _pi_queue{std::move(computeCuStreams),
+                         std::move(transferCuStreams),
+                         context,
+                         context->get_device(),
+                         properties,
+                         flags,
+                         /*backend_owns*/ false};
+  (*queue)->num_compute_streams_ = 1;
+
+  return retErr;
 }
 
 pi_result cuda_piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer,

@@ -401,18 +401,19 @@ struct _pi_queue {
   unsigned int flags_;
   std::mutex compute_stream_mutex_;
   std::mutex transfer_stream_mutex_;
+  bool has_ownership_;
 
   _pi_queue(std::vector<CUstream> &&compute_streams,
             std::vector<CUstream> &&transfer_streams, _pi_context *context,
             _pi_device *device, pi_queue_properties properties,
-            unsigned int flags)
+            unsigned int flags, bool backend_owns = true)
       : compute_streams_{std::move(compute_streams)},
         transfer_streams_{std::move(transfer_streams)}, context_{context},
         device_{device}, properties_{properties}, refCount_{1}, eventCount_{0},
         compute_stream_idx_{0}, transfer_stream_idx_{0},
         num_compute_streams_{0}, num_transfer_streams_{0},
         last_sync_compute_streams_{0}, last_sync_transfer_streams_{0},
-        flags_(flags) {
+        flags_(flags), has_ownership_{backend_owns} {
     cuda_piContextRetain(context_);
     cuda_piDeviceRetain(device_);
   }
@@ -513,6 +514,8 @@ struct _pi_queue {
   pi_uint32 get_reference_count() const noexcept { return refCount_; }
 
   pi_uint32 get_next_event_id() noexcept { return ++eventCount_; }
+
+  bool backend_has_ownership() const noexcept { return has_ownership_; }
 };
 
 typedef void (*pfn_notify)(pi_event event, pi_int32 eventCommandStatus,

@@ -40,6 +40,7 @@ int main() {
 
   backend_traits<backend::ext_oneapi_cuda>::return_type<device> cu_device;
   backend_traits<backend::ext_oneapi_cuda>::return_type<context> cu_context;
+  backend_traits<backend::ext_oneapi_cuda>::return_type<queue> cu_queue;
 
   // 4.5.1.2 For each SYCL runtime class T which supports SYCL application
   // interoperability, a specialization of get_native must be defined, which
@@ -50,6 +51,7 @@ int main() {
 
   cu_device = get_native<backend::ext_oneapi_cuda>(Device);
   cu_context = get_native<backend::ext_oneapi_cuda>(Context);
+  cu_queue = get_native<backend::ext_oneapi_cuda>(Queue);
 
   // Check deprecated
   // expected-warning@+2 {{'get_native' is deprecated: Use SYCL 2020 sycl::get_native free function}}
@@ -58,6 +60,9 @@ int main() {
   // expected-warning@+2 {{'get_native' is deprecated: Use SYCL 2020 sycl::get_native free function}}
   // expected-warning@+1 {{'get_native<sycl::backend::ext_oneapi_cuda>' is deprecated: Use SYCL 2020 sycl::get_native free function}}
   cu_context = Context.get_native<backend::ext_oneapi_cuda>();
+  // expected-warning@+2 {{'get_native' is deprecated: Use SYCL 2020 sycl::get_native free function}}
+  // expected-warning@+1 {{'get_native<sycl::backend::ext_oneapi_cuda>' is deprecated: Use SYCL 2020 sycl::get_native free function}}
+  cu_queue = Queue.get_native<backend::ext_oneapi_cuda>();
 
   // 4.5.1.1 For each SYCL runtime class T which supports SYCL application
   // interoperability with the SYCL backend, a specialization of input_type must
@@ -85,5 +90,7 @@ int main() {
   context InteropContext =
       make_context<backend::ext_oneapi_cuda>(InteropContextInput);
 
+  queue InteropQueue = make_queue<backend::ext_oneapi_cuda>(cu_queue, Context);
+
   return 0;
 }