
Commit 6909c06

[SYCL][CUDA][LIT] Extend test for non-OpenCL (#1654)
Rewrite the parallel_for_range.cpp test to work with non-OpenCL PI backends that behave like OpenCL 1.2. The DPCPP runtime relies on piEnqueueKernelLaunch for NDRange parameter validity checks, so add the missing checks to the PI CUDA backend.

Level0 testing times out on Windows, so mark `windows` as unsupported.

Signed-off-by: Bjoern Knafla <[email protected]>
1 parent d8fe68e · commit 6909c06
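The checks being added mirror the NDRange validity rules OpenCL 1.2 enforces in clEnqueueNDRangeKernel: each local work size must be non-zero, must not exceed the device's per-dimension work-item limit, and must evenly divide the corresponding global size, and the product of all local sizes must not exceed the device's work-group limit. A minimal standalone sketch of those rules, with hypothetical names (validateNDRange and the result enum are not part of the PI API):

#include <cstddef>

// Hypothetical sketch of the OpenCL 1.2-style NDRange checks that the PI
// CUDA backend now performs in cuda_piEnqueueKernelLaunch (see diff below).
enum ValidationResult {
  Valid,
  InvalidWorkItemSize,  // a local size exceeds the per-dimension limit
  InvalidWorkGroupSize  // zero, non-divisor, or too large overall
};

ValidationResult validateNDRange(std::size_t work_dim,
                                 const std::size_t *global_work_size,
                                 const std::size_t *local_work_size,
                                 const std::size_t *max_work_item_sizes,
                                 std::size_t max_work_group_size) {
  std::size_t total = 1;
  for (std::size_t i = 0; i < work_dim; ++i) {
    if (local_work_size[i] > max_work_item_sizes[i])
      return InvalidWorkItemSize;
    // Zero local sizes, and local sizes that do not evenly divide the
    // global range (non-uniform work-groups), are rejected as in OpenCL 1.2.
    if (local_work_size[i] == 0 ||
        global_work_size[i] % local_work_size[i] != 0)
      return InvalidWorkGroupSize;
    total *= local_work_size[i];
  }
  // The product of all local sizes must fit the device work-group limit.
  return total <= max_work_group_size ? Valid : InvalidWorkGroupSize;
}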

File tree

2 files changed, +229 −155 lines


sycl/plugins/cuda/pi_cuda.cpp

Lines changed: 62 additions & 35 deletions
@@ -2267,9 +2267,71 @@ pi_result cuda_piEnqueueKernelLaunch(
   assert(command_queue != nullptr);
   assert(command_queue->get_context() == kernel->get_context());
   assert(kernel != nullptr);
+  assert(global_work_offset != nullptr);
   assert(work_dim > 0);
   assert(work_dim < 4);
 
+  // Set the number of threads per block to the number of threads per warp
+  // by default, unless the user has provided a better number.
+  int threadsPerBlock[3] = {32, 1, 1};
+
+  {
+    size_t maxThreadsPerBlock[3] = {};
+    pi_result retError = cuda_piDeviceGetInfo(
+        command_queue->device_, PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES,
+        sizeof(maxThreadsPerBlock), maxThreadsPerBlock, nullptr);
+    assert(retError == PI_SUCCESS);
+    (void)retError;
+    size_t maxWorkGroupSize = 0;
+    retError = cuda_piDeviceGetInfo(
+        command_queue->device_, PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE,
+        sizeof(maxWorkGroupSize), &maxWorkGroupSize, nullptr);
+    assert(retError == PI_SUCCESS);
+
+    if (local_work_size) {
+      for (size_t i = 0; i < work_dim; i++) {
+        if (local_work_size[i] > maxThreadsPerBlock[i])
+          return PI_INVALID_WORK_ITEM_SIZE;
+        // Check that the local work size is neither zero nor larger than
+        // the global work size, and that it divides the global work size
+        // evenly.
+        if (0u == local_work_size[i])
+          return PI_INVALID_WORK_GROUP_SIZE;
+        if (0u != (global_work_size[i] % local_work_size[i]))
+          return PI_INVALID_WORK_GROUP_SIZE;
+        threadsPerBlock[i] = static_cast<int>(local_work_size[i]);
+      }
+      if (maxWorkGroupSize < size_t(threadsPerBlock[0] * threadsPerBlock[1] *
+                                    threadsPerBlock[2])) {
+        return PI_INVALID_WORK_GROUP_SIZE;
+      }
+    } else {
+      // Determine local work sizes that result in uniform work groups.
+      // The default threadsPerBlock only requires handling the first
+      // work_dim dimension.
+      threadsPerBlock[0] =
+          std::min(static_cast<int>(maxThreadsPerBlock[0]),
+                   std::min(static_cast<int>(global_work_size[0]),
+                            static_cast<int>(threadsPerBlock[0])));
+      // Find a local work group size that is a divisor of the global
+      // work group size to produce uniform work groups.
+      while (0u != (global_work_size[0] % threadsPerBlock[0])) {
+        --threadsPerBlock[0];
+      }
+      assert(
+          maxWorkGroupSize >=
+          size_t(threadsPerBlock[0] * threadsPerBlock[1] * threadsPerBlock[2]));
+    }
+  }
+
+  int blocksPerGrid[3] = {1, 1, 1};
+
+  for (size_t i = 0; i < work_dim; i++) {
+    blocksPerGrid[i] =
+        static_cast<int>(global_work_size[i] + threadsPerBlock[i] - 1) /
+        threadsPerBlock[i];
+  }
+
   pi_result retError = PI_SUCCESS;
   std::unique_ptr<_pi_event> retImplEv{nullptr};
 

@@ -2297,41 +2359,6 @@ pi_result cuda_piEnqueueKernelLaunch(
         cuda_implicit_offset);
   }
 
-  // Set the number of threads per block to the number of threads per warp
-  // by default unless user has provided a better number
-  int threadsPerBlock[3] = {32, 1, 1};
-
-  if (local_work_size) {
-    for (size_t i = 0; i < work_dim; i++) {
-      threadsPerBlock[i] = static_cast<int>(local_work_size[i]);
-    }
-  } else {
-    for (size_t i = 0; i < work_dim; i++) {
-      threadsPerBlock[i] = std::min(static_cast<int>(global_work_size[i]),
-                                    static_cast<int>(threadsPerBlock[i]));
-    }
-  }
-
-  size_t maxThreadsPerBlock[3] = {};
-  retError = cuda_piDeviceGetInfo(
-      command_queue->device_, PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES,
-      sizeof(maxThreadsPerBlock), maxThreadsPerBlock, nullptr);
-  assert(retError == PI_SUCCESS);
-
-  for (size_t i = 0; i < work_dim; i++) {
-    if (size_t(threadsPerBlock[i]) > maxThreadsPerBlock[i]) {
-      return PI_INVALID_WORK_GROUP_SIZE;
-    }
-  }
-
-  int blocksPerGrid[3] = {1, 1, 1};
-
-  for (size_t i = 0; i < work_dim; i++) {
-    blocksPerGrid[i] =
-        static_cast<int>(global_work_size[i] + threadsPerBlock[i] - 1) /
-        threadsPerBlock[i];
-  }
-
   auto argIndices = kernel->get_arg_indices();
 
   if (event) {
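When the caller supplies no local_work_size, the new fallback in the first hunk no longer merely clamps to the warp-sized default: it walks downward to the nearest divisor of the global size so that every work-group stays uniform. A small sketch of that search with a worked example (pickThreadsPerBlock is a hypothetical helper, not part of the plugin):

#include <algorithm>
#include <cstdio>

// Sketch of the fallback in the first hunk: start from the smaller of the
// warp-sized default, the global size, and the per-dimension device limit,
// then walk down to the nearest divisor of the global size.
int pickThreadsPerBlock(int globalSize, int maxPerDim, int defaultTpb = 32) {
  int tpb = std::min(maxPerDim, std::min(globalSize, defaultTpb));
  while (globalSize % tpb != 0)
    --tpb; // terminates: 1 divides every global size
  return tpb;
}

int main() {
  // A 1-D range of 1000 work-items on a device allowing 1024 per dimension:
  int tpb = pickThreadsPerBlock(1000, 1024); // 32 -> 25, since 1000 % 25 == 0
  int blocks = (1000 + tpb - 1) / tpb;       // 40 uniform blocks of 25
  std::printf("threadsPerBlock=%d blocksPerGrid=%d\n", tpb, blocks);
  return 0;
}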
