Commit f21a70f

[mlir][cuda] Guard mgpuLaunchClusterKernel for Cuda 12.0+ (NFC) (#73495)

1 parent: 344b534
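
This is a pure move (NFC): the body of mgpuLaunchClusterKernel is unchanged, but it now lives inside the file's existing #if (CUDA_VERSION >= 12000) block, so the cluster-launch driver API (CUlaunchConfig, cuLaunchKernelEx) is no longer referenced when building against pre-12.0 toolkits. A minimal sketch of the guard pattern, illustrative rather than part of the commit (cuda.h defines CUDA_VERSION as major * 1000 + minor * 10, so 12000 corresponds to CUDA 12.0):

    #include <cuda.h>

    #if (CUDA_VERSION >= 12000)
    // Code in this block may freely use CUDA 12 driver symbols such as
    // CUlaunchConfig and cuLaunchKernelEx; older toolkits never see it,
    // so the rest of the file still compiles against them.
    #endif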

1 file changed

mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp

Lines changed: 54 additions & 54 deletions
@@ -194,60 +194,6 @@ mgpuLaunchKernel(CUfunction function, intptr_t gridX, intptr_t gridY,
                                       extra));
 }
 
-extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuLaunchClusterKernel(
-    CUfunction function, intptr_t clusterX, intptr_t clusterY,
-    intptr_t clusterZ, intptr_t gridX, intptr_t gridY, intptr_t gridZ,
-    intptr_t blockX, intptr_t blockY, intptr_t blockZ, int32_t smem,
-    CUstream stream, void **params, void **extra, size_t /*paramsCount*/) {
-  ScopedContext scopedContext;
-  if (smem > 0) {
-    // Avoid checking driver as it's more expensive than if statement
-    int32_t maxShmem = 0;
-    CUdevice device = getDefaultCuDevice();
-    CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice));
-    CUDA_REPORT_IF_ERROR(cuDeviceGetAttribute(
-        &maxShmem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
-        device));
-    if (maxShmem < smem) {
-      fprintf(stderr,
-              "Requested shared memory (%dkb) is larger than maximum allowed "
-              "shared memory (%dkb) for this device\n",
-              smem, maxShmem);
-    }
-    CUDA_REPORT_IF_ERROR(cuFuncSetAttribute(
-        function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, smem));
-  }
-  CUlaunchConfig config;
-  config.gridDimX = gridX;
-  config.gridDimY = gridY;
-  config.gridDimZ = gridZ;
-  config.blockDimX = blockX;
-  config.blockDimY = blockY;
-  config.blockDimZ = blockZ;
-  config.sharedMemBytes = smem;
-  config.hStream = stream;
-  CUlaunchAttribute launchAttr[2];
-  launchAttr[0].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
-  launchAttr[0].value.clusterDim.x = clusterX;
-  launchAttr[0].value.clusterDim.y = clusterY;
-  launchAttr[0].value.clusterDim.z = clusterZ;
-  launchAttr[1].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE;
-  launchAttr[1].value.clusterSchedulingPolicyPreference =
-      CU_CLUSTER_SCHEDULING_POLICY_SPREAD;
-  config.numAttrs = 2;
-  config.attrs = launchAttr;
-
-  debug_print("Launching kernel,"
-              "cluster: %ld, %ld, %ld, "
-              "grid=%ld,%ld,%ld, "
-              "threads: %ld, %ld, %ld, "
-              "smem: %dkb\n",
-              clusterX, clusterY, clusterZ, gridX, gridY, gridZ, blockX, blockY,
-              blockZ, smem);
-
-  CUDA_REPORT_IF_ERROR(cuLaunchKernelEx(&config, function, params, extra));
-}
-
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUstream mgpuStreamCreate() {
   ScopedContext scopedContext;
   CUstream stream = nullptr;
@@ -383,6 +329,60 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSetDefaultDevice(int32_t device) {
 
 #if (CUDA_VERSION >= 12000)
 
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuLaunchClusterKernel(
+    CUfunction function, intptr_t clusterX, intptr_t clusterY,
+    intptr_t clusterZ, intptr_t gridX, intptr_t gridY, intptr_t gridZ,
+    intptr_t blockX, intptr_t blockY, intptr_t blockZ, int32_t smem,
+    CUstream stream, void **params, void **extra, size_t /*paramsCount*/) {
+  ScopedContext scopedContext;
+  if (smem > 0) {
+    // Avoid checking driver as it's more expensive than if statement
+    int32_t maxShmem = 0;
+    CUdevice device = getDefaultCuDevice();
+    CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice));
+    CUDA_REPORT_IF_ERROR(cuDeviceGetAttribute(
+        &maxShmem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
+        device));
+    if (maxShmem < smem) {
+      fprintf(stderr,
+              "Requested shared memory (%dkb) is larger than maximum allowed "
+              "shared memory (%dkb) for this device\n",
+              smem, maxShmem);
+    }
+    CUDA_REPORT_IF_ERROR(cuFuncSetAttribute(
+        function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, smem));
+  }
+  CUlaunchConfig config;
+  config.gridDimX = gridX;
+  config.gridDimY = gridY;
+  config.gridDimZ = gridZ;
+  config.blockDimX = blockX;
+  config.blockDimY = blockY;
+  config.blockDimZ = blockZ;
+  config.sharedMemBytes = smem;
+  config.hStream = stream;
+  CUlaunchAttribute launchAttr[2];
+  launchAttr[0].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
+  launchAttr[0].value.clusterDim.x = clusterX;
+  launchAttr[0].value.clusterDim.y = clusterY;
+  launchAttr[0].value.clusterDim.z = clusterZ;
+  launchAttr[1].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE;
+  launchAttr[1].value.clusterSchedulingPolicyPreference =
+      CU_CLUSTER_SCHEDULING_POLICY_SPREAD;
+  config.numAttrs = 2;
+  config.attrs = launchAttr;
+
+  debug_print("Launching kernel,"
+              "cluster: %ld, %ld, %ld, "
+              "grid=%ld,%ld,%ld, "
+              "threads: %ld, %ld, %ld, "
+              "smem: %dkb\n",
+              clusterX, clusterY, clusterZ, gridX, gridY, gridZ, blockX, blockY,
+              blockZ, smem);
+
+  CUDA_REPORT_IF_ERROR(cuLaunchKernelEx(&config, function, params, extra));
+}
+
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuTensorMapEncodeTiled(
     CUtensorMap *tensorMap,             // Tensor map object
     CUtensorMapDataType tensorDataType, // Tensor data type
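
Note that the compile-time guard is only half the check: a wrapper built against CUDA 12.0+ can still run on a device that cannot launch clusters. A minimal sketch of a runtime probe a caller might add before invoking mgpuLaunchClusterKernel; the supportsClusterLaunch helper is hypothetical and not part of this commit, while cuDeviceGetAttribute and CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH come from the CUDA 12 driver API:

    #include <cuda.h>

    // Hypothetical helper: returns true when `device` can execute clustered
    // kernel launches. Guarded like the wrapper above, because the attribute
    // enum value is not available in older CUDA headers.
    static bool supportsClusterLaunch(CUdevice device) {
    #if (CUDA_VERSION >= 12000)
      int supported = 0;
      if (cuDeviceGetAttribute(&supported, CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH,
                               device) != CUDA_SUCCESS)
        return false;
      return supported != 0;
    #else
      (void)device;
      return false; // Toolkit predates cluster launch support entirely.
    #endif
    }

Also worth noting for callers: with CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION set, the grid dimensions passed to cuLaunchKernelEx are still expressed in thread blocks, so each grid dimension must be a multiple of the corresponding cluster dimension.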
