
Commit 5388149

[mlir][cuda runtime] Set Max Dynamic Shared Memory Attribute
This work aims to address larger shared memory usage in the MLIR CUDA runtime. When a kernel's dynamic shared memory usage exceeds 48KB, the CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES attribute of the CUDA kernel must be set appropriately; this change sets the attribute as required. It also adds some debug prints for better visibility and troubleshooting.

Reviewed By: nicolasvasilache

Differential Revision: https://reviews.llvm.org/D156874
1 parent b953155 commit 5388149
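
For context, the driver-API calls in this change have a CUDA runtime API counterpart that kernels launched outside the MLIR wrappers need as well. A minimal sketch, assuming a hypothetical kernel myKernel and an illustrative 64KB request (a device with at least 64KB of opt-in shared memory per block, e.g. sm_70 or newer, is assumed; neither is part of this change):

#include <cuda_runtime.h>
#include <cstdio>

// Hypothetical kernel that uses dynamically sized shared memory.
__global__ void myKernel(float *out) {
  extern __shared__ float buf[];
  buf[threadIdx.x] = static_cast<float>(threadIdx.x);
  __syncthreads();
  out[threadIdx.x] = buf[threadIdx.x];
}

int main() {
  // 64KB of dynamic shared memory exceeds the default 48KB cap, so the
  // launch is rejected unless the opt-in attribute is set first.
  int smemBytes = 64 * 1024;
  cudaFuncSetAttribute(myKernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
                       smemBytes);

  float *out = nullptr;
  cudaMalloc(&out, 256 * sizeof(float));
  myKernel<<<1, 256, smemBytes>>>(out);
  printf("launch: %s\n", cudaGetErrorString(cudaGetLastError()));

  cudaDeviceSynchronize();
  cudaFree(out);
  return 0;
}

The wrapper change below does the same thing with the driver API (cuFuncSetAttribute on CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES), and additionally checks the request against CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN.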

File tree

1 file changed: +27 −3 lines changed

mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp

Lines changed: 27 additions & 3 deletions
@@ -73,6 +73,13 @@ bool isDebugEnabled() {
               __func__, __VA_ARGS__);                                         \
   } while (0)
 
+// Returns default CUdevice
+CUdevice getDefaultCuDevice() {
+  CUdevice device;
+  CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice));
+  return device;
+}
+
 // Make the primary context of the current default device current for the
 // duration
 // of the instance and restore the previous context on destruction.
@@ -83,11 +90,10 @@ class ScopedContext {
     // defaultDevice.
     static CUcontext context = [] {
       CUDA_REPORT_IF_ERROR(cuInit(/*flags=*/0));
-      CUdevice device;
-      CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice));
       CUcontext ctx;
       // Note: this does not affect the current context.
-      CUDA_REPORT_IF_ERROR(cuDevicePrimaryCtxRetain(&ctx, device));
+      CUDA_REPORT_IF_ERROR(
+          cuDevicePrimaryCtxRetain(&ctx, getDefaultCuDevice()));
       return ctx;
     }();
 
@@ -140,6 +146,24 @@ mgpuLaunchKernel(CUfunction function, intptr_t gridX, intptr_t gridY,
                  intptr_t blockZ, int32_t smem, CUstream stream, void **params,
                  void **extra) {
   ScopedContext scopedContext;
+  int32_t maxShmem = 0;
+  CUdevice device = getDefaultCuDevice();
+  CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice));
+  CUDA_REPORT_IF_ERROR(cuDeviceGetAttribute(
+      &maxShmem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
+      device));
+  if (maxShmem < smem) {
+    fprintf(stderr,
+            "Requested shared memory (%dkb) is larger than maximum allowed "
+            "shared memory (%dkb) for this device\n",
+            smem, maxShmem);
+  }
+  CUDA_REPORT_IF_ERROR(cuFuncSetAttribute(
+      function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, smem));
+  debug_print("Launching kernel, grid=%ld,%ld,%ld, "
+              "threads: %ld, %ld, %ld, "
+              "smem: %dkb\n",
+              gridX, gridY, gridZ, blockX, blockY, blockZ, smem);
   CUDA_REPORT_IF_ERROR(cuLaunchKernel(function, gridX, gridY, gridZ, blockX,
                                       blockY, blockZ, smem, stream, params,
                                       extra));
