@@ -299,7 +299,7 @@ int getAttribute(pi_device device, hipDeviceAttribute_t attribute) {
 }
 /// \endcond
 
-void simpleGuessLocalWorkSize(int *threadsPerBlock,
+void simpleGuessLocalWorkSize(size_t *threadsPerBlock,
                               const size_t *global_work_size,
                               const size_t maxThreadsPerBlock[3],
                               pi_kernel kernel) {
@@ -314,8 +314,7 @@ void simpleGuessLocalWorkSize(int *threadsPerBlock,
 
   (void)minGrid; // Not used, avoid warnings
 
-  threadsPerBlock[0] = std::min(static_cast<int>(maxThreadsPerBlock[0]),
-                                static_cast<int>(global_work_size[0]));
+  threadsPerBlock[0] = std::min(maxThreadsPerBlock[0], global_work_size[0]);
 
   // Find a local work group size that is a divisor of the global
   // work group size to produce uniform work groups.
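
To illustrate the divisor search the comment above describes, here is a minimal standalone sketch (guessBlockSize is a hypothetical helper, not this plugin's code): start from the largest block size allowed by both the device limit and the global size, then shrink it until it divides the global size evenly, so every work group is uniform.

    #include <algorithm>
    #include <cstddef>

    // Shrink the candidate block size until it evenly divides the global
    // size, so every launched block is full (uniform work groups).
    // Assumes a non-zero global size; terminates because 1 divides anything.
    static std::size_t guessBlockSize(std::size_t globalSize,
                                      std::size_t maxBlockSize) {
      std::size_t block = std::min(maxBlockSize, globalSize);
      while (globalSize % block != 0)
        --block;
      return block;
    }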
@@ -2501,7 +2500,7 @@ pi_result hip_piEnqueueKernelLaunch(
 
   // Set the number of threads per block to the number of threads per warp
   // by default unless user has provided a better number
-  int threadsPerBlock[3] = {32, 1, 1};
+  size_t threadsPerBlock[3] = {32u, 1u, 1u};
   size_t maxWorkGroupSize = 0u;
   size_t maxThreadsPerBlock[3] = {};
   bool providedLocalWorkGroupSize = (local_work_size != nullptr);
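
The switch from int to size_t keeps a 64-bit work size from being narrowed. A self-contained illustration of the failure mode the old static_casts allowed (the values are hypothetical, not taken from this codebase):

    #include <cstddef>
    #include <iostream>

    int main() {
      std::size_t global = 3'000'000'000u;     // a work size above INT_MAX
      int narrowed = static_cast<int>(global); // no longer representable
      std::cout << narrowed << '\n';           // negative on typical LP64 targets
      std::cout << global << '\n';             // size_t keeps the count intact
      return 0;
    }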
@@ -2531,7 +2530,7 @@ pi_result hip_piEnqueueKernelLaunch(
       return PI_INVALID_WORK_GROUP_SIZE;
     if (0u != (global_work_size[dim] % local_work_size[dim]))
       return PI_INVALID_WORK_GROUP_SIZE;
-    threadsPerBlock[dim] = static_cast<int>(local_work_size[dim]);
+    threadsPerBlock[dim] = local_work_size[dim];
     return PI_SUCCESS;
   };
 
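For reference, the per-dimension checks the lambda above performs can be read as the following standalone sketch (isValidLocalSize is an illustrative name, not the plugin's API): the user-supplied local size must not exceed the device limit for that dimension and must divide the global size exactly.

    #include <cstddef>

    // True when a user-supplied local size is usable in one dimension:
    // within the device's per-dimension limit and an exact divisor of the
    // global size, so no partial trailing work group is created.
    static bool isValidLocalSize(std::size_t localSize, std::size_t globalSize,
                                 std::size_t maxThreads) {
      if (localSize == 0 || localSize > maxThreads)
        return false;
      return globalSize % localSize == 0;
    }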
@@ -2551,12 +2550,11 @@ pi_result hip_piEnqueueKernelLaunch(
       return PI_INVALID_WORK_GROUP_SIZE;
     }
 
-  int blocksPerGrid[3] = {1, 1, 1};
+  size_t blocksPerGrid[3] = {1u, 1u, 1u};
 
   for (size_t i = 0; i < work_dim; i++) {
     blocksPerGrid[i] =
-        static_cast<int>(global_work_size[i] + threadsPerBlock[i] - 1) /
-        threadsPerBlock[i];
+        (global_work_size[i] + threadsPerBlock[i] - 1) / threadsPerBlock[i];
   }
 
   pi_result retError = PI_SUCCESS;
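
With size_t operands, the grid computation above is a plain unsigned ceiling division and needs no casts. A small self-contained check of the idiom (ceilDiv is an illustrative helper, not part of the plugin):

    #include <cassert>
    #include <cstddef>

    // Unsigned ceiling division: rounds the block count up so the grid
    // covers every work item. Assumes threadsPerBlock != 0, which the
    // validation above guarantees.
    static std::size_t ceilDiv(std::size_t globalSize,
                               std::size_t threadsPerBlock) {
      return (globalSize + threadsPerBlock - 1) / threadsPerBlock;
    }

    int main() {
      assert(ceilDiv(100, 32) == 4); // 4 blocks of 32 cover 100 work items
      assert(ceilDiv(96, 32) == 3);  // exact multiple needs no extra block
      return 0;
    }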