@@ -299,7 +299,7 @@ int getAttribute(pi_device device, hipDeviceAttribute_t attribute) {
 }
 /// \endcond
 
-void simpleGuessLocalWorkSize(int *threadsPerBlock,
+void simpleGuessLocalWorkSize(size_t *threadsPerBlock,
                               const size_t *global_work_size,
                               const size_t maxThreadsPerBlock[3],
                               pi_kernel kernel) {
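The int -> size_t change in this signature keeps the size_t work-group quantities that SYCL/PI pass around from being narrowed. As a hypothetical illustration (not from this patch) of what the old int path risked:

    #include <climits>
    #include <cstddef>

    size_t global = static_cast<size_t>(INT_MAX) + 1; // 2147483648 work items
    int narrowed = static_cast<int>(global); // out of range for int: the result
                                             // is implementation-defined and
                                             // commonly wraps to INT_MIN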
@@ -314,8 +314,7 @@ void simpleGuessLocalWorkSize(int *threadsPerBlock,
 
   // (void)minGrid; // Not used, avoid warnings
 
-  threadsPerBlock[0] = std::min(static_cast<int>(maxThreadsPerBlock[0]),
-                                static_cast<int>(global_work_size[0]));
+  threadsPerBlock[0] = std::min(maxThreadsPerBlock[0], global_work_size[0]);
 
   // Find a local work group size that is a divisor of the global
   // work group size to produce uniform work groups.
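The comment above describes the divisor search that follows this hunk. A minimal sketch of that idea, assuming a 1D case (guessBlockSize is a hypothetical name, not the plugin's actual code):

    #include <algorithm>
    #include <cstddef>

    // Shrink the candidate block size until it evenly divides the global size,
    // so every launched work group has the same shape (uniform work groups).
    size_t guessBlockSize(size_t globalSize, size_t maxBlockSize) {
      size_t block = std::min(maxBlockSize, globalSize);
      while (globalSize % block != 0) // always terminates: 1 divides everything
        --block;
      return block;
    }

For example, guessBlockSize(1000, 1024) returns 1000, while guessBlockSize(1000, 512) steps down to 500, the largest divisor of 1000 that fits the limit.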
@@ -2492,7 +2491,7 @@ pi_result hip_piEnqueueKernelLaunch(
 
   // Set the number of threads per block to the number of threads per warp
   // by default unless user has provided a better number
-  int threadsPerBlock[3] = {32, 1, 1};
+  size_t threadsPerBlock[3] = {32u, 1u, 1u};
   size_t maxWorkGroupSize = 0u;
   size_t maxThreadsPerBlock[3] = {};
   bool providedLocalWorkGroupSize = (local_work_size != nullptr);
@@ -2522,7 +2521,7 @@ pi_result hip_piEnqueueKernelLaunch(
       return PI_INVALID_WORK_GROUP_SIZE;
     if (0u != (global_work_size[dim] % local_work_size[dim]))
       return PI_INVALID_WORK_GROUP_SIZE;
-    threadsPerBlock[dim] = static_cast<int>(local_work_size[dim]);
+    threadsPerBlock[dim] = local_work_size[dim];
     return PI_SUCCESS;
   };
 
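For reference, the per-dimension checks in the lambda above amount to the following standalone sketch (isValidLocalSize is a hypothetical helper, not part of pi_hip.cpp):

    #include <cstddef>

    // A user-supplied local size is rejected when it exceeds the device's
    // per-dimension limit or does not divide the global size (which would
    // require non-uniform work groups).
    bool isValidLocalSize(size_t globalSize, size_t localSize, size_t maxSize) {
      return localSize <= maxSize && globalSize % localSize == 0;
    }

With threadsPerBlock now size_t, an accepted local_work_size[dim] is stored directly, without the narrowing cast the old code needed.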
@@ -2542,12 +2541,11 @@ pi_result hip_piEnqueueKernelLaunch(
       return PI_INVALID_WORK_GROUP_SIZE;
     }
 
-  int blocksPerGrid[3] = {1, 1, 1};
+  size_t blocksPerGrid[3] = {1u, 1u, 1u};
 
   for (size_t i = 0; i < work_dim; i++) {
     blocksPerGrid[i] =
-        static_cast<int>(global_work_size[i] + threadsPerBlock[i] - 1) /
-        threadsPerBlock[i];
+        (global_work_size[i] + threadsPerBlock[i] - 1) / threadsPerBlock[i];
   }
 
   pi_result retError = PI_SUCCESS;
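The expression in the loop above is integer ceiling division: (g + t - 1) / t rounds up, so the grid covers every work item even when t does not divide g. A small self-check (ceilDiv is a hypothetical helper, not from the source):

    #include <cstddef>

    constexpr size_t ceilDiv(size_t g, size_t t) { return (g + t - 1) / t; }

    static_assert(ceilDiv(1000, 32) == 32, "31 full blocks plus 1 partial block");
    static_assert(ceilDiv(1024, 32) == 32, "an exact multiple adds no extra block");

Keeping the arithmetic in size_t also means the intermediate sum is no longer narrowed to int, which could truncate for very large global sizes.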