@@ -194,60 +194,6 @@ mgpuLaunchKernel(CUfunction function, intptr_t gridX, intptr_t gridY,
                                       extra));
 }

-extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuLaunchClusterKernel(
-    CUfunction function, intptr_t clusterX, intptr_t clusterY,
-    intptr_t clusterZ, intptr_t gridX, intptr_t gridY, intptr_t gridZ,
-    intptr_t blockX, intptr_t blockY, intptr_t blockZ, int32_t smem,
-    CUstream stream, void **params, void **extra, size_t /*paramsCount*/) {
-  ScopedContext scopedContext;
-  if (smem > 0) {
-    // Avoid checking driver as it's more expensive than if statement
-    int32_t maxShmem = 0;
-    CUdevice device = getDefaultCuDevice();
-    CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice));
-    CUDA_REPORT_IF_ERROR(cuDeviceGetAttribute(
-        &maxShmem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
-        device));
-    if (maxShmem < smem) {
-      fprintf(stderr,
-              "Requested shared memory (%dkb) is larger than maximum allowed "
-              "shared memory (%dkb) for this device\n",
-              smem, maxShmem);
-    }
-    CUDA_REPORT_IF_ERROR(cuFuncSetAttribute(
-        function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, smem));
-  }
-  CUlaunchConfig config;
-  config.gridDimX = gridX;
-  config.gridDimY = gridY;
-  config.gridDimZ = gridZ;
-  config.blockDimX = blockX;
-  config.blockDimY = blockY;
-  config.blockDimZ = blockZ;
-  config.sharedMemBytes = smem;
-  config.hStream = stream;
-  CUlaunchAttribute launchAttr[2];
-  launchAttr[0].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
-  launchAttr[0].value.clusterDim.x = clusterX;
-  launchAttr[0].value.clusterDim.y = clusterY;
-  launchAttr[0].value.clusterDim.z = clusterZ;
-  launchAttr[1].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE;
-  launchAttr[1].value.clusterSchedulingPolicyPreference =
-      CU_CLUSTER_SCHEDULING_POLICY_SPREAD;
-  config.numAttrs = 2;
-  config.attrs = launchAttr;
-
-  debug_print("Launching kernel,"
-              " cluster: %ld, %ld, %ld, "
-              " grid=%ld,%ld,%ld, "
-              " threads: %ld, %ld, %ld, "
-              " smem: %dkb\n",
-              clusterX, clusterY, clusterZ, gridX, gridY, gridZ, blockX, blockY,
-              blockZ, smem);
-
-  CUDA_REPORT_IF_ERROR(cuLaunchKernelEx(&config, function, params, extra));
-}
-
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUstream mgpuStreamCreate() {
   ScopedContext scopedContext;
   CUstream stream = nullptr;
@@ -383,6 +329,60 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSetDefaultDevice(int32_t device) {

 #if (CUDA_VERSION >= 12000)

+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuLaunchClusterKernel(
+    CUfunction function, intptr_t clusterX, intptr_t clusterY,
+    intptr_t clusterZ, intptr_t gridX, intptr_t gridY, intptr_t gridZ,
+    intptr_t blockX, intptr_t blockY, intptr_t blockZ, int32_t smem,
+    CUstream stream, void **params, void **extra, size_t /*paramsCount*/) {
+  ScopedContext scopedContext;
+  if (smem > 0) {
+    // Avoid checking driver as it's more expensive than if statement
+    int32_t maxShmem = 0;
+    CUdevice device = getDefaultCuDevice();
+    CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice));
+    CUDA_REPORT_IF_ERROR(cuDeviceGetAttribute(
+        &maxShmem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
+        device));
+    if (maxShmem < smem) {
+      fprintf(stderr,
+              "Requested shared memory (%dkb) is larger than maximum allowed "
+              "shared memory (%dkb) for this device\n",
+              smem, maxShmem);
+    }
+    CUDA_REPORT_IF_ERROR(cuFuncSetAttribute(
+        function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, smem));
+  }
+  CUlaunchConfig config;
+  config.gridDimX = gridX;
+  config.gridDimY = gridY;
+  config.gridDimZ = gridZ;
+  config.blockDimX = blockX;
+  config.blockDimY = blockY;
+  config.blockDimZ = blockZ;
+  config.sharedMemBytes = smem;
+  config.hStream = stream;
+  CUlaunchAttribute launchAttr[2];
+  launchAttr[0].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
+  launchAttr[0].value.clusterDim.x = clusterX;
+  launchAttr[0].value.clusterDim.y = clusterY;
+  launchAttr[0].value.clusterDim.z = clusterZ;
+  launchAttr[1].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE;
+  launchAttr[1].value.clusterSchedulingPolicyPreference =
+      CU_CLUSTER_SCHEDULING_POLICY_SPREAD;
+  config.numAttrs = 2;
+  config.attrs = launchAttr;
+
+  debug_print("Launching kernel,"
+              " cluster: %ld, %ld, %ld, "
+              " grid=%ld,%ld,%ld, "
+              " threads: %ld, %ld, %ld, "
+              " smem: %dkb\n",
+              clusterX, clusterY, clusterZ, gridX, gridY, gridZ, blockX, blockY,
+              blockZ, smem);
+
+  CUDA_REPORT_IF_ERROR(cuLaunchKernelEx(&config, function, params, extra));
+}
+
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuTensorMapEncodeTiled(
     CUtensorMap *tensorMap,             // Tensor map object
     CUtensorMapDataType tensorDataType, // Tensor data type
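For reference, below is a minimal host-side sketch (not part of the patch) of how the relocated mgpuLaunchClusterKernel wrapper might be driven directly, assuming the wrapper library is linked, CUDA 12.0+, and a cluster-capable device (sm_90 or newer) at the wrappers' default ordinal 0. The module file name and kernel name are hypothetical placeholders, the wrapper declarations are copied from the diff above, and error checking on the raw driver calls is omitted for brevity.

// Illustrative caller; a sketch under the assumptions stated above.
#include <cuda.h>
#include <cstdint>

// Declarations copied from the diff; provided by the linked wrapper library.
extern "C" CUstream mgpuStreamCreate();
extern "C" void mgpuLaunchClusterKernel(
    CUfunction function, intptr_t clusterX, intptr_t clusterY,
    intptr_t clusterZ, intptr_t gridX, intptr_t gridY, intptr_t gridZ,
    intptr_t blockX, intptr_t blockY, intptr_t blockZ, int32_t smem,
    CUstream stream, void **params, void **extra, size_t paramsCount);

int main() {
  // Use the device's primary context so that module, stream, and the
  // wrapper's ScopedContext all agree on the same context.
  cuInit(0);
  CUdevice device;
  cuDeviceGet(&device, /*ordinal=*/0);
  CUcontext context;
  cuDevicePrimaryCtxRetain(&context, device);
  cuCtxSetCurrent(context);

  // Hypothetical precompiled kernel image and kernel name.
  CUmodule module;
  cuModuleLoad(&module, "cluster_kernel.cubin");
  CUfunction function;
  cuModuleGetFunction(&function, module, "cluster_kernel");

  CUstream stream = mgpuStreamCreate();

  // Launch a 4x1x1 grid of blocks grouped into 2x1x1 clusters (grid dims
  // must be a multiple of the cluster dims), 128 threads per block, no
  // dynamic shared memory and no kernel parameters.
  mgpuLaunchClusterKernel(function,
                          /*cluster=*/2, 1, 1,
                          /*grid=*/4, 1, 1,
                          /*block=*/128, 1, 1,
                          /*smem=*/0, stream,
                          /*params=*/nullptr, /*extra=*/nullptr,
                          /*paramsCount=*/0);
  cuStreamSynchronize(stream);

  cuModuleUnload(module);
  cuDevicePrimaryCtxRelease(device);
  return 0;
}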