@@ -244,6 +244,34 @@ int getAttribute(pi_device device, CUdevice_attribute attribute) {
 }
 /// \endcond
 
+// Determine local work sizes that result in uniform work groups.
+// The default threadsPerBlock only requires handling the first work_dim
+// dimension.
+void guessLocalWorkSize(int *threadsPerBlock, const size_t *global_work_size,
+                        const size_t maxThreadsPerBlock[3], pi_kernel kernel) {
+  assert(threadsPerBlock != nullptr);
+  assert(global_work_size != nullptr);
+  assert(kernel != nullptr);
+  int recommendedBlockSize, minGrid;
+
+  PI_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize(
+      &minGrid, &recommendedBlockSize, kernel->get(), NULL,
+      kernel->get_local_size(), maxThreadsPerBlock[0]));
+
+  (void)minGrid; // Not used, avoid warnings
+
+  threadsPerBlock[0] =
+      std::min(static_cast<int>(maxThreadsPerBlock[0]),
+               std::min(static_cast<int>(global_work_size[0]),
+                        static_cast<int>(recommendedBlockSize)));
+
+  // Find a local work group size that is a divisor of the global
+  // work group size to produce uniform work groups.
+  while (0u != (global_work_size[0] % threadsPerBlock[0])) {
+    --threadsPerBlock[0];
+  }
+}
+
 } // anonymous namespace
 
 /// ------ Error handling, matching OpenCL plugin semantics.
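The divisor search added above can be exercised on its own. The following is a minimal, self-contained sketch (hypothetical names, not part of the patch), with the occupancy recommendation passed in as a plain parameter instead of being queried from cuOccupancyMaxPotentialBlockSize:

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdio>

// Pick the largest block size that is <= min(device limit, global size,
// occupancy recommendation) and that divides the global size exactly, so
// every work group launched is uniform.
int uniformBlockSize(size_t globalSize, size_t deviceLimit,
                     int recommendedBlockSize) {
  assert(globalSize != 0u && recommendedBlockSize > 0);
  int threads = std::min(static_cast<int>(deviceLimit),
                         std::min(static_cast<int>(globalSize),
                                  recommendedBlockSize));
  // Terminates because 1 divides every global size.
  while (globalSize % threads != 0u)
    --threads;
  return threads;
}

int main() {
  // Global size 1000 with a recommendation of 256: 256 does not divide
  // 1000, so the search settles on 250 (four uniform groups of 250).
  std::printf("%d\n", uniformBlockSize(1000, 1024, 256)); // prints 250
}

One consequence worth keeping in mind: when the global size has no divisor near the recommendation (a prime global size is the extreme case), the loop walks all the way down to 1, trading occupancy for uniform work groups.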
@@ -2277,56 +2305,53 @@ pi_result cuda_piEnqueueKernelLaunch(
   // Set the number of threads per block to the number of threads per warp
   // by default unless user has provided a better number
   int threadsPerBlock[3] = {32, 1, 1};
+  size_t maxWorkGroupSize = 0u;
+  size_t maxThreadsPerBlock[3] = {};
+  bool providedLocalWorkGroupSize = (local_work_size != nullptr);
 
   {
-    size_t maxThreadsPerBlock[3] = {};
     pi_result retError = cuda_piDeviceGetInfo(
         command_queue->device_, PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES,
         sizeof(maxThreadsPerBlock), maxThreadsPerBlock, nullptr);
     assert(retError == PI_SUCCESS);
     (void)retError;
-    size_t maxWorkGroupSize = 0;
+
     retError = cuda_piDeviceGetInfo(
         command_queue->device_, PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE,
         sizeof(maxWorkGroupSize), &maxWorkGroupSize, nullptr);
     assert(retError == PI_SUCCESS);
 
-    if (local_work_size) {
-      for (size_t i = 0; i < work_dim; i++) {
-        if (local_work_size[i] > maxThreadsPerBlock[i])
+    if (providedLocalWorkGroupSize) {
+      auto isValid = [&](int dim) {
+        if (local_work_size[dim] > maxThreadsPerBlock[dim])
           return PI_INVALID_WORK_ITEM_SIZE;
         // Checks that local work sizes are a divisor of the global work sizes,
         // which implies that the local work sizes are neither larger than the
         // global work sizes nor zero.
-        if (0u == local_work_size[i])
+        if (0u == local_work_size[dim])
          return PI_INVALID_WORK_GROUP_SIZE;
-        if (0u != (global_work_size[i] % local_work_size[i]))
+        if (0u != (global_work_size[dim] % local_work_size[dim]))
          return PI_INVALID_WORK_GROUP_SIZE;
-        threadsPerBlock[i] = static_cast<int>(local_work_size[i]);
-      }
-      if (maxWorkGroupSize < size_t(threadsPerBlock[0] * threadsPerBlock[1] *
-                                    threadsPerBlock[2])) {
-        return PI_INVALID_WORK_GROUP_SIZE;
+        threadsPerBlock[dim] = static_cast<int>(local_work_size[dim]);
+        return PI_SUCCESS;
+      };
+
+      for (size_t dim = 0; dim < work_dim; dim++) {
+        auto err = isValid(dim);
+        if (err != PI_SUCCESS)
+          return err;
      }
    } else {
-      // Determine local work sizes that result in uniform work groups.
-      // The default threadsPerBlock only require handling the first work_dim
-      // dimension.
-      threadsPerBlock[0] =
-          std::min(static_cast<int>(maxThreadsPerBlock[0]),
-                   std::min(static_cast<int>(global_work_size[0]),
-                            static_cast<int>(threadsPerBlock[0])));
-      // Find a local work group size that is a divisor of the global
-      // work group size to produce uniform work groups.
-      while (0u != (global_work_size[0] % threadsPerBlock[0])) {
-        --threadsPerBlock[0];
-      }
-      assert(
-          maxWorkGroupSize >=
-          size_t(threadsPerBlock[0] * threadsPerBlock[1] * threadsPerBlock[2]));
+      guessLocalWorkSize(threadsPerBlock, global_work_size, maxThreadsPerBlock,
+                         kernel);
    }
  }
 
+  if (maxWorkGroupSize <
+      size_t(threadsPerBlock[0] * threadsPerBlock[1] * threadsPerBlock[2])) {
+    return PI_INVALID_WORK_GROUP_SIZE;
+  }
+
   int blocksPerGrid[3] = {1, 1, 1};
 
   for (size_t i = 0; i < work_dim; i++) {
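For reference, the per-dimension rules enforced by the isValid lambda, restated as free functions (a sketch with hypothetical names, not the plugin's API), together with the total-size check the patch hoists out of the if/else so it also covers guessed sizes:

#include <cstddef>

// Hypothetical stand-ins for the PI error codes used above.
enum Result { Success, InvalidWorkItemSize, InvalidWorkGroupSize };

// Per-dimension rules mirrored from the isValid lambda: the local size
// must not exceed the device's per-dimension limit, must be non-zero, and
// must divide the global size so that work groups are uniform.
Result checkDim(size_t localSize, size_t globalSize, size_t dimLimit) {
  if (localSize > dimLimit)
    return InvalidWorkItemSize;
  if (localSize == 0u || globalSize % localSize != 0u)
    return InvalidWorkGroupSize;
  return Success;
}

// The hoisted total-size check: the product of all three dimensions must
// fit the device's work-group limit, whether the sizes were user-provided
// or guessed.
Result checkTotal(const int threadsPerBlock[3], size_t maxWorkGroupSize) {
  size_t total = size_t(threadsPerBlock[0]) * threadsPerBlock[1] *
                 threadsPerBlock[2];
  return total <= maxWorkGroupSize ? Success : InvalidWorkGroupSize;
}

Hoisting the product check also changes behavior slightly: an oversized result from guessLocalWorkSize is now rejected with PI_INVALID_WORK_GROUP_SIZE rather than caught only by the assert the old else-branch carried.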
@@ -2340,8 +2365,8 @@ pi_result cuda_piEnqueueKernelLaunch(
 
   try {
     ScopedContext active(command_queue->get_context());
-    CUfunction cuFunc = kernel->get();
     CUstream cuStream = command_queue->get();
+    CUfunction cuFunc = kernel->get();
 
     retError = cuda_piEnqueueEventsWait(command_queue, num_events_in_wait_list,
                                         event_wait_list, nullptr);
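Further down the function (outside the diff), the plugin hands these handles to the driver. A sketch of the shape of that call, using the documented cuLaunchKernel signature with hypothetical argument plumbing:

#include <cuda.h>

// Hypothetical plumbing around the documented cuLaunchKernel entry point:
// cuFunc and cuStream are the handles fetched above, threadsPerBlock and
// blocksPerGrid are the sizes computed earlier in the function, and
// argIndices stands in for the kernel argument pointers.
CUresult launch(CUfunction cuFunc, CUstream cuStream,
                const int blocksPerGrid[3], const int threadsPerBlock[3],
                void **argIndices, unsigned sharedMemBytes) {
  return cuLaunchKernel(cuFunc, blocksPerGrid[0], blocksPerGrid[1],
                        blocksPerGrid[2], threadsPerBlock[0],
                        threadsPerBlock[1], threadsPerBlock[2],
                        sharedMemBytes, cuStream, argIndices, nullptr);
}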