@@ -2267,9 +2267,71 @@ pi_result cuda_piEnqueueKernelLaunch(
   assert(command_queue != nullptr);
   assert(command_queue->get_context() == kernel->get_context());
   assert(kernel != nullptr);
+  assert(global_work_offset != nullptr);
   assert(work_dim > 0);
   assert(work_dim < 4);
 
+  // Set the number of threads per block to the number of threads per warp
+  // by default unless user has provided a better number
+  int threadsPerBlock[3] = {32, 1, 1};
+
+  {
+    size_t maxThreadsPerBlock[3] = {};
+    pi_result retError = cuda_piDeviceGetInfo(
+        command_queue->device_, PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES,
+        sizeof(maxThreadsPerBlock), maxThreadsPerBlock, nullptr);
+    assert(retError == PI_SUCCESS);
+    (void)retError;
+    size_t maxWorkGroupSize = 0;
+    retError = cuda_piDeviceGetInfo(
+        command_queue->device_, PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE,
+        sizeof(maxWorkGroupSize), &maxWorkGroupSize, nullptr);
+    assert(retError == PI_SUCCESS);
+
+    if (local_work_size) {
+      for (size_t i = 0; i < work_dim; i++) {
+        if (local_work_size[i] > maxThreadsPerBlock[i])
+          return PI_INVALID_WORK_ITEM_SIZE;
+        // Check that the local work sizes are a divisor of the global work
+        // sizes, which also guarantees that they are neither zero nor larger
+        // than the global work sizes.
+        if (0u == local_work_size[i])
+          return PI_INVALID_WORK_GROUP_SIZE;
+        if (0u != (global_work_size[i] % local_work_size[i]))
+          return PI_INVALID_WORK_GROUP_SIZE;
+        threadsPerBlock[i] = static_cast<int>(local_work_size[i]);
+      }
+      if (maxWorkGroupSize < size_t(threadsPerBlock[0] * threadsPerBlock[1] *
+                                    threadsPerBlock[2])) {
+        return PI_INVALID_WORK_GROUP_SIZE;
+      }
+    } else {
+      // Determine local work sizes that result in uniform work groups.
+      // The default threadsPerBlock only requires handling the first work_dim
+      // dimension.
+      threadsPerBlock[0] =
+          std::min(static_cast<int>(maxThreadsPerBlock[0]),
+                   std::min(static_cast<int>(global_work_size[0]),
+                            static_cast<int>(threadsPerBlock[0])));
+      // Find a local work group size that is a divisor of the global
+      // work group size to produce uniform work groups.
+      while (0u != (global_work_size[0] % threadsPerBlock[0])) {
+        --threadsPerBlock[0];
+      }
+      assert(
+          maxWorkGroupSize >=
+          size_t(threadsPerBlock[0] * threadsPerBlock[1] * threadsPerBlock[2]));
+    }
+  }
+
+  int blocksPerGrid[3] = {1, 1, 1};
+
+  for (size_t i = 0; i < work_dim; i++) {
+    blocksPerGrid[i] =
+        static_cast<int>(global_work_size[i] + threadsPerBlock[i] - 1) /
+        threadsPerBlock[i];
+  }
+
   pi_result retError = PI_SUCCESS;
   std::unique_ptr<_pi_event> retImplEv{nullptr};
 
@@ -2297,41 +2359,6 @@ pi_result cuda_piEnqueueKernelLaunch(
                                     cuda_implicit_offset);
   }
 
-  // Set the number of threads per block to the number of threads per warp
-  // by default unless user has provided a better number
-  int threadsPerBlock[3] = {32, 1, 1};
-
-  if (local_work_size) {
-    for (size_t i = 0; i < work_dim; i++) {
-      threadsPerBlock[i] = static_cast<int>(local_work_size[i]);
-    }
-  } else {
-    for (size_t i = 0; i < work_dim; i++) {
-      threadsPerBlock[i] = std::min(static_cast<int>(global_work_size[i]),
-                                    static_cast<int>(threadsPerBlock[i]));
-    }
-  }
-
-  size_t maxThreadsPerBlock[3] = {};
-  retError = cuda_piDeviceGetInfo(
-      command_queue->device_, PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES,
-      sizeof(maxThreadsPerBlock), maxThreadsPerBlock, nullptr);
-  assert(retError == PI_SUCCESS);
-
-  for (size_t i = 0; i < work_dim; i++) {
-    if (size_t(threadsPerBlock[i]) > maxThreadsPerBlock[i]) {
-      return PI_INVALID_WORK_GROUP_SIZE;
-    }
-  }
-
-  int blocksPerGrid[3] = {1, 1, 1};
-
-  for (size_t i = 0; i < work_dim; i++) {
-    blocksPerGrid[i] =
-        static_cast<int>(global_work_size[i] + threadsPerBlock[i] - 1) /
-        threadsPerBlock[i];
-  }
-
   auto argIndices = kernel->get_arg_indices();
 
   if (event) {
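For readers skimming the hunk above: when no local_work_size is given, the new code clamps the block size to the device limits and then walks it down to the nearest divisor of the global size, so every work group is uniform, before rounding the grid size up. Below is a minimal standalone sketch of that selection logic under stated assumptions; the hard-coded limits (kMaxThreadsDim0, kMaxWorkGroupSize) and the helper name pickThreadsPerBlock are illustrative stand-ins for the values the plugin actually queries, not part of the patch or the PI API.

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdio>

// Illustrative stand-ins for the limits the plugin queries via
// cuda_piDeviceGetInfo (PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES and
// PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE).
static const int kMaxThreadsDim0 = 1024;
static const std::size_t kMaxWorkGroupSize = 1024;

// Mirrors the added logic for dimension 0: default to one warp (32 threads),
// clamp to the device and global limits, then decrement to the nearest
// divisor of the global size so that every work group is uniform.
static int pickThreadsPerBlock(std::size_t globalSize) {
  int threads = std::min(kMaxThreadsDim0,
                         std::min(static_cast<int>(globalSize), 32));
  while (globalSize % static_cast<std::size_t>(threads) != 0u)
    --threads;
  assert(kMaxWorkGroupSize >= static_cast<std::size_t>(threads));
  return threads;
}

int main() {
  const std::size_t globalSize = 1000; // not a multiple of 32
  const int threads = pickThreadsPerBlock(globalSize);
  // Same rounding-up division as the blocksPerGrid computation in the patch.
  const int blocks =
      static_cast<int>(globalSize + threads - 1) / threads;
  // Prints threads=25 blocks=40: 25 is the largest divisor of 1000 <= 32.
  std::printf("threads=%d blocks=%d\n", threads, blocks);
  return 0;
}

The decrementing search always terminates (1 divides everything) and trades a slightly smaller block for uniform groups, which is what a CUDA launch needs since cuLaunchKernel takes a single block size for the whole grid.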