Skip to content

Commit f7dd22c

Browse files
committed
[SYCL][CUDA] Check kernel launch params
The DPCPP runtime relies on piEnqueueKernelLaunch for NDRange parameter validity checks. Add missing checks to the PI CUDA backend. Signed-off-by: Bjoern Knafla <[email protected]>
1 parent 8a23977 commit f7dd22c

File tree

1 file changed

+62
-35
lines changed

1 file changed

+62
-35
lines changed

sycl/plugins/cuda/pi_cuda.cpp

Lines changed: 62 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -2267,9 +2267,71 @@ pi_result cuda_piEnqueueKernelLaunch(
22672267
assert(command_queue != nullptr);
22682268
assert(command_queue->get_context() == kernel->get_context());
22692269
assert(kernel != nullptr);
2270+
assert(global_work_offset != nullptr);
22702271
assert(work_dim > 0);
22712272
assert(work_dim < 4);
22722273

2274+
// Set the number of threads per block to the number of threads per warp
2275+
// by default unless the user has provided a better number
2276+
int threadsPerBlock[3] = {32, 1, 1};
2277+
2278+
{
2279+
size_t maxThreadsPerBlock[3] = {};
2280+
pi_result retError = cuda_piDeviceGetInfo(
2281+
command_queue->device_, PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES,
2282+
sizeof(maxThreadsPerBlock), maxThreadsPerBlock, nullptr);
2283+
assert(retError == PI_SUCCESS);
2284+
(void)retError;
2285+
size_t maxWorkGroupSize = 0;
2286+
retError = cuda_piDeviceGetInfo(
2287+
command_queue->device_, PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE,
2288+
sizeof(maxWorkGroupSize), &maxWorkGroupSize, nullptr);
2289+
assert(retError == PI_SUCCESS);
2290+
2291+
if (local_work_size) {
2292+
for (size_t i = 0; i < work_dim; i++) {
2293+
if (local_work_size[i] > maxThreadsPerBlock[i])
2294+
return PI_INVALID_WORK_ITEM_SIZE;
2295+
// Check that each local work size is a divisor of the corresponding
2296+
// global work size, which implies that no local work size is larger
2297+
// than its global work size or equal to 0.
2298+
if (0u == local_work_size[i])
2299+
return PI_INVALID_WORK_GROUP_SIZE;
2300+
if (0u != (global_work_size[i] % local_work_size[i]))
2301+
return PI_INVALID_WORK_GROUP_SIZE;
2302+
threadsPerBlock[i] = static_cast<int>(local_work_size[i]);
2303+
}
2304+
if (maxWorkGroupSize < size_t(threadsPerBlock[0] * threadsPerBlock[1] *
2305+
threadsPerBlock[2])) {
2306+
return PI_INVALID_WORK_GROUP_SIZE;
2307+
}
2308+
} else {
2309+
// Determine local work sizes that result in uniform work groups.
2310+
// The default threadsPerBlock only requires handling the first work_dim
2311+
// dimension.
2312+
threadsPerBlock[0] =
2313+
std::min(static_cast<int>(maxThreadsPerBlock[0]),
2314+
std::min(static_cast<int>(global_work_size[0]),
2315+
static_cast<int>(threadsPerBlock[0])));
2316+
// Find a local work group size that is a divisor of the global
2317+
// work group size to produce uniform work groups.
2318+
while (0u != (global_work_size[0] % threadsPerBlock[0])) {
2319+
--threadsPerBlock[0];
2320+
}
2321+
assert(
2322+
maxWorkGroupSize >=
2323+
size_t(threadsPerBlock[0] * threadsPerBlock[1] * threadsPerBlock[2]));
2324+
}
2325+
}
2326+
2327+
int blocksPerGrid[3] = {1, 1, 1};
2328+
2329+
for (size_t i = 0; i < work_dim; i++) {
2330+
blocksPerGrid[i] =
2331+
static_cast<int>(global_work_size[i] + threadsPerBlock[i] - 1) /
2332+
threadsPerBlock[i];
2333+
}
2334+
22732335
pi_result retError = PI_SUCCESS;
22742336
std::unique_ptr<_pi_event> retImplEv{nullptr};
22752337

@@ -2297,41 +2359,6 @@ pi_result cuda_piEnqueueKernelLaunch(
22972359
cuda_implicit_offset);
22982360
}
22992361

2300-
// Set the number of threads per block to the number of threads per warp
2301-
// by default unless user has provided a better number
2302-
int threadsPerBlock[3] = {32, 1, 1};
2303-
2304-
if (local_work_size) {
2305-
for (size_t i = 0; i < work_dim; i++) {
2306-
threadsPerBlock[i] = static_cast<int>(local_work_size[i]);
2307-
}
2308-
} else {
2309-
for (size_t i = 0; i < work_dim; i++) {
2310-
threadsPerBlock[i] = std::min(static_cast<int>(global_work_size[i]),
2311-
static_cast<int>(threadsPerBlock[i]));
2312-
}
2313-
}
2314-
2315-
size_t maxThreadsPerBlock[3] = {};
2316-
retError = cuda_piDeviceGetInfo(
2317-
command_queue->device_, PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES,
2318-
sizeof(maxThreadsPerBlock), maxThreadsPerBlock, nullptr);
2319-
assert(retError == PI_SUCCESS);
2320-
2321-
for (size_t i = 0; i < work_dim; i++) {
2322-
if (size_t(threadsPerBlock[i]) > maxThreadsPerBlock[i]) {
2323-
return PI_INVALID_WORK_GROUP_SIZE;
2324-
}
2325-
}
2326-
2327-
int blocksPerGrid[3] = {1, 1, 1};
2328-
2329-
for (size_t i = 0; i < work_dim; i++) {
2330-
blocksPerGrid[i] =
2331-
static_cast<int>(global_work_size[i] + threadsPerBlock[i] - 1) /
2332-
threadsPerBlock[i];
2333-
}
2334-
23352362
auto argIndices = kernel->get_arg_indices();
23362363

23372364
if (event) {

0 commit comments

Comments
 (0)