Skip to content

Commit 4fabfd1

Browse files
authored
[SYCL][CUDA] Default block size attempts to maximize occupancy (#2724)
Using the cuOccupancyMaxPotentialBlockSize function from the CUDA driver, this patch tries to find a better block size for the default configuration. It takes into account the kernel properties and the dynamic local memory size required by the kernel.
1 parent 0b7dacf commit 4fabfd1

File tree

1 file changed

+53
-28
lines changed

1 file changed

+53
-28
lines changed

sycl/plugins/cuda/pi_cuda.cpp

Lines changed: 53 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,34 @@ int getAttribute(pi_device device, CUdevice_attribute attribute) {
244244
}
245245
/// \endcond
246246

247+
// Determine local work sizes that result in uniform work groups.
248+
// The default threadsPerBlock only require handling the first work_dim
249+
// dimension.
250+
void guessLocalWorkSize(int *threadsPerBlock, const size_t *global_work_size,
251+
const size_t maxThreadsPerBlock[3], pi_kernel kernel) {
252+
assert(threadsPerBlock != nullptr);
253+
assert(global_work_size != nullptr);
254+
assert(kernel != nullptr);
255+
int recommendedBlockSize, minGrid;
256+
257+
PI_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize(
258+
&minGrid, &recommendedBlockSize, kernel->get(), NULL,
259+
kernel->get_local_size(), maxThreadsPerBlock[0]));
260+
261+
(void)minGrid; // Not used, avoid warnings
262+
263+
threadsPerBlock[0] =
264+
std::min(static_cast<int>(maxThreadsPerBlock[0]),
265+
std::min(static_cast<int>(global_work_size[0]),
266+
static_cast<int>(recommendedBlockSize)));
267+
268+
// Find a local work group size that is a divisor of the global
269+
// work group size to produce uniform work groups.
270+
while (0u != (global_work_size[0] % threadsPerBlock[0])) {
271+
--threadsPerBlock[0];
272+
}
273+
}
274+
247275
} // anonymous namespace
248276

249277
/// ------ Error handling, matching OpenCL plugin semantics.
@@ -2277,56 +2305,53 @@ pi_result cuda_piEnqueueKernelLaunch(
22772305
// Set the number of threads per block to the number of threads per warp
22782306
// by default unless user has provided a better number
22792307
int threadsPerBlock[3] = {32, 1, 1};
2308+
size_t maxWorkGroupSize = 0u;
2309+
size_t maxThreadsPerBlock[3] = {};
2310+
bool providedLocalWorkGroupSize = (local_work_size != nullptr);
22802311

22812312
{
2282-
size_t maxThreadsPerBlock[3] = {};
22832313
pi_result retError = cuda_piDeviceGetInfo(
22842314
command_queue->device_, PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES,
22852315
sizeof(maxThreadsPerBlock), maxThreadsPerBlock, nullptr);
22862316
assert(retError == PI_SUCCESS);
22872317
(void)retError;
2288-
size_t maxWorkGroupSize = 0;
2318+
22892319
retError = cuda_piDeviceGetInfo(
22902320
command_queue->device_, PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE,
22912321
sizeof(maxWorkGroupSize), &maxWorkGroupSize, nullptr);
22922322
assert(retError == PI_SUCCESS);
22932323

2294-
if (local_work_size) {
2295-
for (size_t i = 0; i < work_dim; i++) {
2296-
if (local_work_size[i] > maxThreadsPerBlock[i])
2324+
if (providedLocalWorkGroupSize) {
2325+
auto isValid = [&](int dim) {
2326+
if (local_work_size[dim] > maxThreadsPerBlock[dim])
22972327
return PI_INVALID_WORK_ITEM_SIZE;
22982328
// Checks that local work sizes are a divisor of the global work sizes
22992329
// which includes that the local work sizes are neither larger than the
23002330
// global work sizes nor 0.
2301-
if (0u == local_work_size[i])
2331+
if (0u == local_work_size[dim])
23022332
return PI_INVALID_WORK_GROUP_SIZE;
2303-
if (0u != (global_work_size[i] % local_work_size[i]))
2333+
if (0u != (global_work_size[dim] % local_work_size[dim]))
23042334
return PI_INVALID_WORK_GROUP_SIZE;
2305-
threadsPerBlock[i] = static_cast<int>(local_work_size[i]);
2306-
}
2307-
if (maxWorkGroupSize < size_t(threadsPerBlock[0] * threadsPerBlock[1] *
2308-
threadsPerBlock[2])) {
2309-
return PI_INVALID_WORK_GROUP_SIZE;
2335+
threadsPerBlock[dim] = static_cast<int>(local_work_size[dim]);
2336+
return PI_SUCCESS;
2337+
};
2338+
2339+
for (size_t dim = 0; dim < work_dim; dim++) {
2340+
auto err = isValid(dim);
2341+
if (err != PI_SUCCESS)
2342+
return err;
23102343
}
23112344
} else {
2312-
// Determine local work sizes that result in uniform work groups.
2313-
// The default threadsPerBlock only require handling the first work_dim
2314-
// dimension.
2315-
threadsPerBlock[0] =
2316-
std::min(static_cast<int>(maxThreadsPerBlock[0]),
2317-
std::min(static_cast<int>(global_work_size[0]),
2318-
static_cast<int>(threadsPerBlock[0])));
2319-
// Find a local work group size that is a divisor of the global
2320-
// work group size to produce uniform work groups.
2321-
while (0u != (global_work_size[0] % threadsPerBlock[0])) {
2322-
--threadsPerBlock[0];
2323-
}
2324-
assert(
2325-
maxWorkGroupSize >=
2326-
size_t(threadsPerBlock[0] * threadsPerBlock[1] * threadsPerBlock[2]));
2345+
guessLocalWorkSize(threadsPerBlock, global_work_size, maxThreadsPerBlock,
2346+
kernel);
23272347
}
23282348
}
23292349

2350+
if (maxWorkGroupSize <
2351+
size_t(threadsPerBlock[0] * threadsPerBlock[1] * threadsPerBlock[2])) {
2352+
return PI_INVALID_WORK_GROUP_SIZE;
2353+
}
2354+
23302355
int blocksPerGrid[3] = {1, 1, 1};
23312356

23322357
for (size_t i = 0; i < work_dim; i++) {
@@ -2340,8 +2365,8 @@ pi_result cuda_piEnqueueKernelLaunch(
23402365

23412366
try {
23422367
ScopedContext active(command_queue->get_context());
2343-
CUfunction cuFunc = kernel->get();
23442368
CUstream cuStream = command_queue->get();
2369+
CUfunction cuFunc = kernel->get();
23452370

23462371
retError = cuda_piEnqueueEventsWait(command_queue, num_events_in_wait_list,
23472372
event_wait_list, nullptr);

0 commit comments

Comments
 (0)