[mlir][gpu] Use alloc OP's host_shared in cuda runtime (#99035)

grypp · web-flow · commit 20861f1f2fdc · 2024-07-17T07:25:11.000+02:00
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -237,11 +237,18 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventRecord(CUevent event,
 }
 
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
-mgpuMemAlloc(uint64_t sizeBytes, CUstream /*stream*/, bool /*isHostShared*/) {
+mgpuMemAlloc(uint64_t sizeBytes, CUstream stream, bool isHostShared) {
   ScopedContext scopedContext;
   CUdeviceptr ptr = 0;
-  if (sizeBytes != 0)
-    CUDA_REPORT_IF_ERROR(cuMemAlloc(&ptr, sizeBytes));
+  if (sizeBytes == 0)
+    return reinterpret_cast<void *>(ptr);
+
+  if (isHostShared) {
+    CUDA_REPORT_IF_ERROR(
+        cuMemAllocManaged(&ptr, sizeBytes, CU_MEM_ATTACH_GLOBAL));
+    return reinterpret_cast<void *>(ptr);
+  }
+  CUDA_REPORT_IF_ERROR(cuMemAlloc(&ptr, sizeBytes));
   return reinterpret_cast<void *>(ptr);
 }
 
diff --git a/mlir/test/Integration/GPU/CUDA/alloc-host-shared.mlir b/mlir/test/Integration/GPU/CUDA/alloc-host-shared.mlir
@@ -0,0 +1,27 @@
+// RUN: mlir-opt %s \
+// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-cpu-runner \
+// RUN:   --shared-libs=%mlir_cuda_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
+
+// CHECK: 2000
+module attributes {gpu.container_module} {
+  func.func @main() {
+    %c1 = arith.constant 1 : index
+    %c0 = arith.constant 0 : index
+    %c1000_i32 = arith.constant 1000 : i32
+    %memref = gpu.alloc  host_shared () : memref<1xi32>
+    memref.store %c1000_i32, %memref[%c1] : memref<1xi32>
+    gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %c1, %arg7 = %c1, %arg8 = %c1) threads(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) {
+      %1 = memref.load %memref[%c1] : memref<1xi32>
+      %2 = arith.addi %1, %1 : i32
+      memref.store %2, %memref[%c1] : memref<1xi32>
+      gpu.terminator
+    }
+    %0 = memref.load %memref[%c1] : memref<1xi32>
+    vector.print %0 : i32
+    return
+  }
+}