@@ -2267,9 +2267,71 @@ pi_result cuda_piEnqueueKernelLaunch(
   assert(command_queue != nullptr);
   assert(command_queue->get_context() == kernel->get_context());
   assert(kernel != nullptr);
+  assert(global_work_offset != nullptr);
   assert(work_dim > 0);
   assert(work_dim < 4);
 
+  // Set the number of threads per block to the number of threads per warp
+  // by default unless user has provided a better number
+  int threadsPerBlock[3] = {32, 1, 1};
+
+  {
+    size_t maxThreadsPerBlock[3] = {};
+    pi_result retError = cuda_piDeviceGetInfo(
+        command_queue->device_, PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES,
+        sizeof(maxThreadsPerBlock), maxThreadsPerBlock, nullptr);
+    assert(retError == PI_SUCCESS);
+    (void)retError;
+    size_t maxWorkGroupSize = 0;
+    retError = cuda_piDeviceGetInfo(
+        command_queue->device_, PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE,
+        sizeof(maxWorkGroupSize), &maxWorkGroupSize, nullptr);
+    assert(retError == PI_SUCCESS);
+
+    if (local_work_size) {
+      for (size_t i = 0; i < work_dim; i++) {
+        if (local_work_size[i] > maxThreadsPerBlock[i])
+          return PI_INVALID_WORK_ITEM_SIZE;
+        // Check that the local work sizes are a divisor of the global work
+        // sizes, which also guarantees that they are neither zero nor larger
+        // than the global work sizes.
+        if (0u == local_work_size[i])
+          return PI_INVALID_WORK_GROUP_SIZE;
+        if (0u != (global_work_size[i] % local_work_size[i]))
+          return PI_INVALID_WORK_GROUP_SIZE;
+        threadsPerBlock[i] = static_cast<int>(local_work_size[i]);
+      }
+      if (maxWorkGroupSize < size_t(threadsPerBlock[0] * threadsPerBlock[1] *
+                                    threadsPerBlock[2])) {
+        return PI_INVALID_WORK_GROUP_SIZE;
+      }
+    } else {
+      // Determine local work sizes that result in uniform work groups.
+      // The default threadsPerBlock only requires handling the first work_dim
+      // dimension.
+      threadsPerBlock[0] =
+          std::min(static_cast<int>(maxThreadsPerBlock[0]),
+                   std::min(static_cast<int>(global_work_size[0]),
+                            static_cast<int>(threadsPerBlock[0])));
+      // Find a local work group size that is a divisor of the global
+      // work group size to produce uniform work groups.
+      while (0u != (global_work_size[0] % threadsPerBlock[0])) {
+        --threadsPerBlock[0];
+      }
+      assert(
+          maxWorkGroupSize >=
+          size_t(threadsPerBlock[0] * threadsPerBlock[1] * threadsPerBlock[2]));
+    }
+  }
+
+  int blocksPerGrid[3] = {1, 1, 1};
+
+  for (size_t i = 0; i < work_dim; i++) {
+    blocksPerGrid[i] =
+        static_cast<int>(global_work_size[i] + threadsPerBlock[i] - 1) /
+        threadsPerBlock[i];
+  }
+
   pi_result retError = PI_SUCCESS;
   std::unique_ptr<_pi_event> retImplEv{nullptr};
 
@@ -2297,41 +2359,6 @@ pi_result cuda_piEnqueueKernelLaunch(
                                     cuda_implicit_offset);
   }
 
-  // Set the number of threads per block to the number of threads per warp
-  // by default unless user has provided a better number
-  int threadsPerBlock[3] = {32, 1, 1};
-
-  if (local_work_size) {
-    for (size_t i = 0; i < work_dim; i++) {
-      threadsPerBlock[i] = static_cast<int>(local_work_size[i]);
-    }
-  } else {
-    for (size_t i = 0; i < work_dim; i++) {
-      threadsPerBlock[i] = std::min(static_cast<int>(global_work_size[i]),
-                                    static_cast<int>(threadsPerBlock[i]));
-    }
-  }
-
-  size_t maxThreadsPerBlock[3] = {};
-  retError = cuda_piDeviceGetInfo(
-      command_queue->device_, PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES,
-      sizeof(maxThreadsPerBlock), maxThreadsPerBlock, nullptr);
-  assert(retError == PI_SUCCESS);
-
-  for (size_t i = 0; i < work_dim; i++) {
-    if (size_t(threadsPerBlock[i]) > maxThreadsPerBlock[i]) {
-      return PI_INVALID_WORK_GROUP_SIZE;
-    }
-  }
-
-  int blocksPerGrid[3] = {1, 1, 1};
-
-  for (size_t i = 0; i < work_dim; i++) {
-    blocksPerGrid[i] =
-        static_cast<int>(global_work_size[i] + threadsPerBlock[i] - 1) /
-        threadsPerBlock[i];
-  }
-
   auto argIndices = kernel->get_arg_indices();
 
   if (event) {
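For readers skimming the hunk above: when no local_work_size is given, the new code clamps the block size to the device limits and then walks it down to the nearest divisor of the global size, so every work group is uniform, before rounding the grid size up. Below is a minimal standalone sketch of that selection logic under stated assumptions; the hard-coded limits (kMaxThreadsDim0, kMaxWorkGroupSize) and the helper name pickThreadsPerBlock are illustrative stand-ins for the values the plugin actually queries, not part of the patch or the PI API.

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdio>

// Illustrative stand-ins for the limits the plugin queries via
// cuda_piDeviceGetInfo (PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES and
// PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE).
static const int kMaxThreadsDim0 = 1024;
static const std::size_t kMaxWorkGroupSize = 1024;

// Mirrors the added logic for dimension 0: default to one warp (32 threads),
// clamp to the device and global limits, then decrement to the nearest
// divisor of the global size so that every work group is uniform.
static int pickThreadsPerBlock(std::size_t globalSize) {
  int threads = std::min(kMaxThreadsDim0,
                         std::min(static_cast<int>(globalSize), 32));
  while (globalSize % static_cast<std::size_t>(threads) != 0u)
    --threads;
  assert(kMaxWorkGroupSize >= static_cast<std::size_t>(threads));
  return threads;
}

int main() {
  const std::size_t globalSize = 1000; // not a multiple of 32
  const int threads = pickThreadsPerBlock(globalSize);
  // Same rounding-up division as the blocksPerGrid computation in the patch.
  const int blocks =
      static_cast<int>(globalSize + threads - 1) / threads;
  // Prints threads=25 blocks=40: 25 is the largest divisor of 1000 <= 32.
  std::printf("threads=%d blocks=%d\n", threads, blocks);
  return 0;
}

The decrementing search always terminates (1 divides everything) and trades a slightly smaller block for uniform groups, which is what a CUDA launch needs since cuLaunchKernel takes a single block size for the whole grid.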