-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[mlir][gpu] Use alloc OP's host_shared
in cuda runtime
#99035
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
`host_shared` on `gpu.alloc` means the memory will be available on both host and device. On the NVIDIA side, this corresponds to managed memory. However, `host_shared` is currently unused in the runtime. This PR uses it to call `cuMemAllocManaged`.
@llvm/pr-subscribers-mlir @llvm/pr-subscribers-mlir-gpu Author: Guray Ozen (grypp) Changes
Full diff: https://github.com/llvm/llvm-project/pull/99035.diff 2 Files Affected:
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index 09dc30365e37c..6a32309aa9e05 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -237,11 +237,18 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventRecord(CUevent event,
}
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
-mgpuMemAlloc(uint64_t sizeBytes, CUstream /*stream*/, bool /*isHostShared*/) {
+mgpuMemAlloc(uint64_t sizeBytes, CUstream stream, bool isHostShared) {
ScopedContext scopedContext;
CUdeviceptr ptr = 0;
- if (sizeBytes != 0)
- CUDA_REPORT_IF_ERROR(cuMemAlloc(&ptr, sizeBytes));
+ if (sizeBytes == 0)
+ return reinterpret_cast<void *>(ptr);
+
+ if (isHostShared) {
+ CUDA_REPORT_IF_ERROR(
+ cuMemAllocManaged(&ptr, sizeBytes, CU_MEM_ATTACH_GLOBAL));
+ return reinterpret_cast<void *>(ptr);
+ }
+ CUDA_REPORT_IF_ERROR(cuMemAlloc(&ptr, sizeBytes));
return reinterpret_cast<void *>(ptr);
}
diff --git a/mlir/test/Integration/GPU/CUDA/alloc-host-shared.mlir b/mlir/test/Integration/GPU/CUDA/alloc-host-shared.mlir
new file mode 100644
index 0000000000000..77fa0deffdd69
--- /dev/null
+++ b/mlir/test/Integration/GPU/CUDA/alloc-host-shared.mlir
@@ -0,0 +1,27 @@
+// RUN: mlir-opt %s \
+// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-cpu-runner \
+// RUN: --shared-libs=%mlir_cuda_runtime \
+// RUN: --shared-libs=%mlir_runner_utils \
+// RUN: --entry-point-result=void \
+// RUN: | FileCheck %s
+
+// CHECK: 2000
+module attributes {gpu.container_module} {
+ func.func @main() {
+ %c1 = arith.constant 1 : index
+ %c0 = arith.constant 0 : index
+ %c1000_i32 = arith.constant 1000 : i32
+ %memref = gpu.alloc host_shared () : memref<1xi32>
+ memref.store %c1000_i32, %memref[%c1] : memref<1xi32>
+ gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %c1, %arg7 = %c1, %arg8 = %c1) threads(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) {
+ %1 = memref.load %memref[%c1] : memref<1xi32>
+ %2 = arith.addi %1, %1 : i32
+ memref.store %2, %memref[%c1] : memref<1xi32>
+ gpu.terminator
+ }
+ %0 = memref.load %memref[%c1] : memref<1xi32>
+ vector.print %0 : i32
+ return
+ }
+}
|
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/138/builds/1358 Here is the relevant piece of the build log for reference:
|
This fixes the unit test that is broken in #99035.
Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags: Differential Revision: https://phabricator.intern.facebook.com/D60250900
Summary: This fixes the unit test that is broken in #99035. Test Plan: Reviewers: Subscribers: Tasks: Tags: Differential Revision: https://phabricator.intern.facebook.com/D60251693
`host_shared` on `gpu.alloc` means the memory will be available on both host and device. On the NVIDIA side, this corresponds to managed memory. However, `host_shared` is currently unused in the runtime. This PR uses it to call `cuMemAllocManaged`.