Skip to content

Commit afeb8a6

Browse files
authored
[SYCL][CUDA] Handle large Y/Z range dimensions. (#7968)
The dimensions passed to sycl::range, determine the blocks per grid and threads per blocks. Currently, calculation of thread per blocks only performed for the x dimension. This means the blocks per grid for y and z dimensions passed to cuLaunchKernel, directly come from the sycl::range arguments. This can result in an error returned on calling cuLaunchKernel, when those parameters for y and z dimensions are larger than 65535. This PR offers a simple tuning of thread per block for larger (over 65535) values of Y and Z dimensions to make the associated blocks per grid within the allowed range.
1 parent ba51bb6 commit afeb8a6

File tree

1 file changed

+19
-9
lines changed

1 file changed

+19
-9
lines changed

sycl/plugins/cuda/pi_cuda.cpp

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -278,23 +278,32 @@ int getAttribute(pi_device device, CUdevice_attribute attribute) {
278278
// Determine local work sizes that result in uniform work groups.
279279
// The default threadsPerBlock only require handling the first work_dim
280280
// dimension.
281-
void guessLocalWorkSize(size_t *threadsPerBlock, const size_t *global_work_size,
281+
void guessLocalWorkSize(_pi_device *device, size_t *threadsPerBlock,
282+
const size_t *global_work_size,
282283
const size_t maxThreadsPerBlock[3], pi_kernel kernel,
283284
pi_uint32 local_size) {
284285
assert(threadsPerBlock != nullptr);
285286
assert(global_work_size != nullptr);
286287
assert(kernel != nullptr);
287-
int recommendedBlockSize, minGrid;
288+
int minGrid, maxBlockSize, gridDim[3];
289+
290+
cuDeviceGetAttribute(&gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y,
291+
device->get());
292+
cuDeviceGetAttribute(&gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z,
293+
device->get());
294+
295+
threadsPerBlock[1] = ((global_work_size[1] - 1) / gridDim[1]) + 1;
296+
threadsPerBlock[2] = ((global_work_size[2] - 1) / gridDim[2]) + 1;
288297

289298
PI_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize(
290-
&minGrid, &recommendedBlockSize, kernel->get(), NULL, local_size,
299+
&minGrid, &maxBlockSize, kernel->get(), NULL, local_size,
291300
maxThreadsPerBlock[0]));
292301

293-
(void)minGrid; // Not used, avoid warnings
302+
gridDim[0] = maxBlockSize / (threadsPerBlock[1] * threadsPerBlock[2]);
294303

295-
threadsPerBlock[0] = std::min(
296-
maxThreadsPerBlock[0],
297-
std::min(global_work_size[0], static_cast<size_t>(recommendedBlockSize)));
304+
threadsPerBlock[0] =
305+
std::min(maxThreadsPerBlock[0],
306+
std::min(global_work_size[0], static_cast<size_t>(gridDim[0])));
298307

299308
// Find a local work group size that is a divisor of the global
300309
// work group size to produce uniform work groups.
@@ -3124,8 +3133,9 @@ pi_result cuda_piEnqueueKernelLaunch(
31243133
return err;
31253134
}
31263135
} else {
3127-
guessLocalWorkSize(threadsPerBlock, global_work_size,
3128-
maxThreadsPerBlock, kernel, local_size);
3136+
guessLocalWorkSize(command_queue->device_, threadsPerBlock,
3137+
global_work_size, maxThreadsPerBlock, kernel,
3138+
local_size);
31293139
}
31303140
}
31313141

0 commit comments

Comments
 (0)