@@ -299,7 +299,7 @@ int getAttribute(pi_device device, hipDeviceAttribute_t attribute) {
 }
 /// \endcond
 
-void simpleGuessLocalWorkSize(int *threadsPerBlock,
+void simpleGuessLocalWorkSize(size_t *threadsPerBlock,
                               const size_t *global_work_size,
                               const size_t maxThreadsPerBlock[3],
                               pi_kernel kernel) {
@@ -314,8 +314,7 @@ void simpleGuessLocalWorkSize(int *threadsPerBlock,
 
   (void)minGrid; // Not used, avoid warnings
 
-  threadsPerBlock[0] = std::min(static_cast<int>(maxThreadsPerBlock[0]),
-                                static_cast<int>(global_work_size[0]));
+  threadsPerBlock[0] = std::min(maxThreadsPerBlock[0], global_work_size[0]);
 
   // Find a local work group size that is a divisor of the global
   // work group size to produce uniform work groups.
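
To illustrate the divisor search the comment above describes, here is a minimal standalone sketch (guessBlockSize is a hypothetical helper, not this plugin's code): start from the largest block size allowed by both the device limit and the global size, then shrink it until it divides the global size evenly, so every work group is uniform.

    #include <algorithm>
    #include <cstddef>

    // Shrink the candidate block size until it evenly divides the global
    // size, so every launched block is full (uniform work groups).
    // Assumes a non-zero global size; terminates because 1 divides anything.
    static std::size_t guessBlockSize(std::size_t globalSize,
                                      std::size_t maxBlockSize) {
      std::size_t block = std::min(maxBlockSize, globalSize);
      while (globalSize % block != 0)
        --block;
      return block;
    }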
@@ -2501,7 +2500,7 @@ pi_result hip_piEnqueueKernelLaunch(
 
   // Set the number of threads per block to the number of threads per warp
   // by default unless user has provided a better number
-  int threadsPerBlock[3] = {32, 1, 1};
+  size_t threadsPerBlock[3] = {32u, 1u, 1u};
   size_t maxWorkGroupSize = 0u;
   size_t maxThreadsPerBlock[3] = {};
   bool providedLocalWorkGroupSize = (local_work_size != nullptr);
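
The switch from int to size_t keeps a 64-bit work size from being narrowed. A self-contained illustration of the failure mode the old static_casts allowed (the values are hypothetical, not taken from this codebase):

    #include <cstddef>
    #include <iostream>

    int main() {
      std::size_t global = 3'000'000'000u;     // a work size above INT_MAX
      int narrowed = static_cast<int>(global); // no longer representable
      std::cout << narrowed << '\n';           // negative on typical LP64 targets
      std::cout << global << '\n';             // size_t keeps the count intact
      return 0;
    }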
@@ -2531,7 +2530,7 @@ pi_result hip_piEnqueueKernelLaunch(
       return PI_INVALID_WORK_GROUP_SIZE;
     if (0u != (global_work_size[dim] % local_work_size[dim]))
       return PI_INVALID_WORK_GROUP_SIZE;
-    threadsPerBlock[dim] = static_cast<int>(local_work_size[dim]);
+    threadsPerBlock[dim] = local_work_size[dim];
     return PI_SUCCESS;
   };
 
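For reference, the per-dimension checks the lambda above performs can be read as the following standalone sketch (isValidLocalSize is an illustrative name, not the plugin's API): the user-supplied local size must not exceed the device limit for that dimension and must divide the global size exactly.

    #include <cstddef>

    // True when a user-supplied local size is usable in one dimension:
    // within the device's per-dimension limit and an exact divisor of the
    // global size, so no partial trailing work group is created.
    static bool isValidLocalSize(std::size_t localSize, std::size_t globalSize,
                                 std::size_t maxThreads) {
      if (localSize == 0 || localSize > maxThreads)
        return false;
      return globalSize % localSize == 0;
    }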
@@ -2551,12 +2550,11 @@ pi_result hip_piEnqueueKernelLaunch(
       return PI_INVALID_WORK_GROUP_SIZE;
     }
 
-  int blocksPerGrid[3] = {1, 1, 1};
+  size_t blocksPerGrid[3] = {1u, 1u, 1u};
 
   for (size_t i = 0; i < work_dim; i++) {
     blocksPerGrid[i] =
-        static_cast<int>(global_work_size[i] + threadsPerBlock[i] - 1) /
-        threadsPerBlock[i];
+        (global_work_size[i] + threadsPerBlock[i] - 1) / threadsPerBlock[i];
   }
 
   pi_result retError = PI_SUCCESS;
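
With size_t operands, the grid computation above is a plain unsigned ceiling division and needs no casts. A small self-contained check of the idiom (ceilDiv is an illustrative helper, not part of the plugin):

    #include <cassert>
    #include <cstddef>

    // Unsigned ceiling division: rounds the block count up so the grid
    // covers every work item. Assumes threadsPerBlock != 0, which the
    // validation above guarantees.
    static std::size_t ceilDiv(std::size_t globalSize,
                               std::size_t threadsPerBlock) {
      return (globalSize + threadsPerBlock - 1) / threadsPerBlock;
    }

    int main() {
      assert(ceilDiv(100, 32) == 4); // 4 blocks of 32 cover 100 work items
      assert(ceilDiv(96, 32) == 3);  // exact multiple needs no extra block
      return 0;
    }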