intel · pvchupin · Nov 10, 2022 · Oct 13, 2022 · Oct 17, 2022 · Oct 17, 2022
@@ -2988,6 +2988,11 @@ pi_result cuda_piEnqueueKernelLaunch(
   assert(work_dim > 0);
   assert(work_dim < 4);
 
+  if (*global_work_size == 0) {
+    return cuda_piEnqueueEventsWaitWithBarrier(
+        command_queue, num_events_in_wait_list, event_wait_list, event);
+  }
+
   // Set the number of threads per block to the number of threads per warp
   // by default unless user has provided a better number
   size_t threadsPerBlock[3] = {32u, 1u, 1u};

@@ -2758,6 +2758,11 @@ pi_result hip_piEnqueueKernelLaunch(
   assert(work_dim > 0);
   assert(work_dim < 4);
 
+  if (*global_work_size == 0) {
+    return hip_piEnqueueEventsWaitWithBarrier(
+        command_queue, num_events_in_wait_list, event_wait_list, event);
+  }
+
   // Set the number of threads per block to the number of threads per warp
   // by default unless user has provided a better number
   size_t threadsPerBlock[3] = {32u, 1u, 1u};

@@ -1934,7 +1934,7 @@ static void adjustNDRangePerKernel(NDRDescT &NDR, RT::PiKernel Kernel,
   if (NDR.GlobalSize[0] != 0)
     return; // GlobalSize is set - no need to adjust
   // check the prerequisites:
-  assert(NDR.NumWorkGroups[0] != 0 && NDR.LocalSize[0] == 0);
+  assert(NDR.LocalSize[0] == 0);
   // TODO might be good to cache this info together with the kernel info to
   // avoid get_kernel_work_group_info on every kernel run
   range<3> WGSize = get_kernel_device_specific_info<