intel · sergey-semenov · Feb 2, 2023 · Jan 31, 2023
@@ -173,6 +173,10 @@ class kernel_impl {
 
   ProgramImplPtr getProgramImpl() const { return MProgramImpl; }
 
+  std::mutex &getNoncacheableEnqueueMutex() { // Accessor for this kernel_impl's own enqueue mutex.
+    return MNoncacheableEnqueueMutex; // enqueueImpKernel takes its address for non-cacheable kernels.
+  }
+
 private:
   RT::PiKernel MKernel;
   const ContextImplPtr MContext;
@@ -181,6 +185,7 @@ class kernel_impl {
   const DeviceImageImplPtr MDeviceImageImpl;
   const KernelBundleImplPtr MKernelBundleImpl;
   bool MIsInterop = false;
+  std::mutex MNoncacheableEnqueueMutex;
 };
 
 template <typename Param>

@@ -2106,12 +2106,7 @@ pi_int32 enqueueImpKernel(
   auto ContextImpl = Queue->getContextImplPtr();
   auto DeviceImpl = Queue->getDeviceImplPtr();
   RT::PiKernel Kernel = nullptr;
-  // Cacheable kernels use per-kernel mutexes that will be fetched from the
-  // cache, others (e.g. interoperability kernels) share a single mutex.
-  // TODO consider adding a PiKernel -> mutex map for allowing to enqueue
-  // different PiKernel's in parallel.
-  static std::mutex NoncacheableEnqueueMutex;
-  std::mutex *KernelMutex = &NoncacheableEnqueueMutex;
+  std::mutex *KernelMutex = nullptr;
   RT::PiProgram Program = nullptr;
 
   std::shared_ptr<kernel_impl> SyclKernelImpl;
@@ -2152,6 +2147,14 @@ pi_int32 enqueueImpKernel(
               OSModuleHandle, ContextImpl, DeviceImpl, KernelName,
               SyclProg.get());
       assert(FoundKernel == Kernel);
+    } else {
+      // Non-cacheable kernels use mutexes from kernel_impls.
+      // TODO this can still result in a race condition if multiple SYCL
+      // kernels are created with the same native handle. To address this,
+      // we need to either store and use a pi_native_handle -> mutex map or
+      // reuse and return existing SYCL kernels from make_native to avoid
+      // their duplication in such cases.
+      KernelMutex = &MSyclKernel->getNoncacheableEnqueueMutex();
     }
   } else {
     std::tie(Kernel, KernelMutex, Program) =