Commit 86888e4

[mlir][sparse][gpu] generate proper memcpy in/out host and device

Host registration is a convenient way to get CUDA kernels running, but it may be slow and does not work for all buffers (such as global constants). This revision uses proper alloc/copy/dealloc chains for buffers, using asynchronous chains to increase overlap. The host registration mechanism is kept under a flag for the output, purely for experimentation while this project ramps up.

Reviewed By: Peiming

Differential Revision: https://reviews.llvm.org/D148682
1 parent 851a121 commit 86888e4
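For a pair of host buffers, the generated IR now has roughly the following shape (a minimal sketch assembled from the CHECK patterns in the updated tests; the value names, the memref<1024xf64> type, the block/thread sizes, and the host buffers %h_x/%h_y are illustrative placeholders, with %h_x assumed to be the output buffer):

  %c1 = arith.constant 1 : index
  %c32 = arith.constant 32 : index
  // Copy-in: each host buffer gets its own async chain of alloc + memcpy.
  %t0 = gpu.wait async
  %d_x, %t1 = gpu.alloc async [%t0] () : memref<1024xf64>
  %t2 = gpu.memcpy async [%t1] %d_x, %h_x : memref<1024xf64>, memref<1024xf64>
  %t3 = gpu.wait async
  %d_y, %t4 = gpu.alloc async [%t3] () : memref<1024xf64>
  %t5 = gpu.memcpy async [%t4] %d_y, %h_y : memref<1024xf64>, memref<1024xf64>
  // Block on all copy-in tokens, then launch the kernel asynchronously.
  gpu.wait [%t2, %t5]
  %tk = gpu.launch_func async @sparse_kernels::@kernel0
          blocks in (%c1, %c1, %c1) threads in (%c32, %c1, %c1)
          args(%d_x : memref<1024xf64>, %d_y : memref<1024xf64>)
  // Copy-out of the assumed output buffer depends on the kernel token;
  // every device buffer is deallocated, then one blocking wait drains the chains.
  %t6 = gpu.memcpy async [%tk] %h_x, %d_x : memref<1024xf64>, memref<1024xf64>
  %t7 = gpu.dealloc async [%t6] %d_x : memref<1024xf64>
  %t8 = gpu.wait async
  %t9 = gpu.dealloc async [%t8] %d_y : memref<1024xf64>
  gpu.wait [%t7, %t9]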

File tree

6 files changed (+314, -36 lines)

mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp

Lines changed: 148 additions & 19 deletions
@@ -76,32 +76,151 @@ static gpu::GPUFuncOp genGPUFunc(OpBuilder &builder, gpu::GPUModuleOp gpuModule,
 }
 
 /// Constructs code to launch GPU kernel.
-static void genLaunchGPUFunc(OpBuilder &builder, gpu::GPUFuncOp gpuFunc,
-                             SmallVectorImpl<Value> &args,
-                             unsigned numThreads) {
+static Value genLaunchGPUFunc(OpBuilder &builder, gpu::GPUFuncOp gpuFunc,
+                              SmallVectorImpl<Value> &args,
+                              SmallVectorImpl<Value> &tokens,
+                              unsigned numThreads) {
   Location loc = gpuFunc->getLoc();
   Value none = TypedValue<::mlir::IntegerType>{};
   Value one = constantIndex(builder, loc, 1);
   Value numT = constantIndex(builder, loc, numThreads);
   gpu::KernelDim3 gridSize = {one, one, one};
   gpu::KernelDim3 blckSize = {numT, one, one};
-  builder.create<gpu::LaunchFuncOp>(loc, gpuFunc, gridSize, blckSize,
-                                    /*dynSharedMemSz*/ none, args);
+  return builder
+      .create<gpu::LaunchFuncOp>(loc, gpuFunc, gridSize, blckSize,
+                                 /*dynSharedMemSz*/ none, args,
+                                 builder.getType<gpu::AsyncTokenType>(), tokens)
+      .getAsyncToken();
 }
 
 /// Maps the provided ranked host buffer into the device address space.
 /// Writes from the host are guaranteed to be visible to device kernels
 /// that are launched afterwards. Writes from the device are guaranteed
 /// to be visible on the host after synchronizing with the device kernel
-/// completion.
+/// completion. Needs to cast the buffer to a unranked buffer.
 static Value genHostRegisterMemref(OpBuilder &builder, Location loc,
                                    Value mem) {
   MemRefType memTp = mem.getType().cast<MemRefType>();
   UnrankedMemRefType resTp =
       UnrankedMemRefType::get(memTp.getElementType(), /*memorySpace=*/0);
   Value cast = builder.create<memref::CastOp>(loc, resTp, mem);
   builder.create<gpu::HostRegisterOp>(loc, cast);
-  return mem; // convenience pass-through
+  return cast;
+}
+
+/// Unmaps the provided buffer, expecting the casted buffer.
+static void genHostUnregisterMemref(OpBuilder &builder, Location loc,
+                                    Value cast) {
+  builder.create<gpu::HostUnregisterOp>(loc, cast);
+}
+
+/// Generates first wait in an asynchronous chain.
+static Value genFirstWait(OpBuilder &builder, Location loc) {
+  Type tokenType = builder.getType<gpu::AsyncTokenType>();
+  return builder.create<gpu::WaitOp>(loc, tokenType, ValueRange())
+      .getAsyncToken();
+}
+
+/// Generates last, blocking wait in an asynchronous chain.
+static void genBlockingWait(OpBuilder &builder, Location loc,
+                            ValueRange operands) {
+  builder.create<gpu::WaitOp>(loc, Type(), operands);
+}
+
+/// Allocates memory on the device.
+/// TODO: A `host_shared` attribute could be used to indicate that
+/// the buffer is visible by both host and device, but lowering
+/// that feature does not seem to be fully supported yet.
+static gpu::AllocOp genAllocMemRef(OpBuilder &builder, Location loc, Value mem,
+                                   Value token) {
+  auto tp = mem.getType().cast<ShapedType>();
+  auto elemTp = tp.getElementType();
+  auto shape = tp.getShape();
+  auto memTp = MemRefType::get(shape, elemTp);
+  SmallVector<Value> dynamicSizes;
+  for (unsigned r = 0, rank = tp.getRank(); r < rank; r++) {
+    if (shape[r] == ShapedType::kDynamic) {
+      Value dim = constantIndex(builder, loc, r);
+      Value dimOp = builder.create<memref::DimOp>(loc, mem, dim);
+      dynamicSizes.push_back(dimOp);
+    }
+  }
+  return builder.create<gpu::AllocOp>(loc, TypeRange({memTp, token.getType()}),
+                                      token, dynamicSizes, ValueRange());
+}
+
+/// Deallocates memory from the device.
+static Value genDeallocMemRef(OpBuilder &builder, Location loc, Value mem,
+                              Value token) {
+  return builder.create<gpu::DeallocOp>(loc, token.getType(), token, mem)
+      .getAsyncToken();
+}
+
+/// Copies memory between host and device (direction is implicit).
+static Value genCopyMemRef(OpBuilder &builder, Location loc, Value dst,
+                           Value src, Value token) {
+  return builder.create<gpu::MemcpyOp>(loc, token.getType(), token, dst, src)
+      .getAsyncToken();
+}
+
+/// Prepares the outlined arguments, passing scalars and buffers in. Here we
+/// assume that the first buffer is the one allocated for output. We create
+/// a set of properly chained asynchronous allocation/copy pairs to increase
+/// overlap before launching the kernel.
+/// TODO: the output assumption may be a bit too brittle
+static Value genParametersIn(OpBuilder &builder, Location loc,
+                             SmallVectorImpl<Value> &scalars,
+                             SmallVectorImpl<Value> &buffers,
+                             SmallVectorImpl<Value> &args,
+                             SmallVectorImpl<Value> &tokens,
+                             bool useHostRegistrationForOut) {
+  Value out;
+  // Scalars are passed by value.
+  for (Value s : scalars)
+    args.push_back(s);
+  // Buffers are need to be made visible on device.
+  for (Value b : buffers) {
+    if (useHostRegistrationForOut) {
+      out = genHostRegisterMemref(builder, loc, b);
+      args.push_back(b);
+      useHostRegistrationForOut = false;
+      continue;
+    }
+    Value firstToken = genFirstWait(builder, loc);
+    auto alloc = genAllocMemRef(builder, loc, b, firstToken);
+    Value devMem = alloc.getResult(0);
+    Value depToken = alloc.getAsyncToken(); // copy-after-alloc
+    args.push_back(devMem);
+    tokens.push_back(genCopyMemRef(builder, loc, devMem, b, depToken));
+  }
+  return out;
+}
+
+/// Finalizes the outlined arguments. The output buffer is copied depending
+/// on the kernel token and then deallocated. All other buffers are simply
+/// deallocated. Then we wait for all operations to complete.
+static void genParametersOut(OpBuilder &builder, Location loc, Value out,
+                             Value kernelToken, SmallVectorImpl<Value> &scalars,
+                             SmallVectorImpl<Value> &buffers,
+                             SmallVectorImpl<Value> &args,
+                             SmallVectorImpl<Value> &tokens) {
+  unsigned base = scalars.size();
+  for (unsigned i = base, e = args.size(); i < e; i++) {
+    Value firstToken;
+    if (i == base) {
+      // Assumed output parameter: unregister or copy-out.
+      if (out) {
+        genHostUnregisterMemref(builder, loc, out);
+        out = Value();
+        continue;
+      }
+      firstToken =
+          genCopyMemRef(builder, loc, buffers[0], args[i], kernelToken);
+    } else {
+      firstToken = genFirstWait(builder, loc);
+    }
+    tokens.push_back(genDeallocMemRef(builder, loc, args[i], firstToken));
+  }
 }
 
 /// Constructs code for new GPU kernel.
@@ -158,10 +277,8 @@ static void genGPUCode(PatternRewriter &rewriter, gpu::GPUFuncOp gpuFunc,
 
 /// Proof-of-concept rewriter. This rule generates a CUDA implementation
 /// for each outermost forall loop generated by the sparse compiler.
-//
-// TODO: right works with parallelization-strategy=dense-outer-loop
-// but give this its own flags in the future
-//
+/// TODO: right works with parallelization-strategy=dense-outer-loop
+/// but give this its own flags in the future
 struct ForallRewriter : public OpRewritePattern<scf::ParallelOp> {
   using OpRewritePattern<scf::ParallelOp>::OpRewritePattern;
 
@@ -211,22 +328,34 @@ struct ForallRewriter : public OpRewritePattern<scf::ParallelOp> {
       else
         return failure(); // don't know how to share
     }
-    // Prepare the outlined arguments, register buffers.
+    // Pass outlined non-constant values.
+    // TODO: Experiment with `useHostRegistrationForOut` to see if we want to
+    // keep the feature at all (either through a heuristic or compiler
+    // option for gpu codegen).
     Location loc = forallOp->getLoc();
     SmallVector<Value> args;
-    for (Value s : scalars)
-      args.push_back(s);
-    for (Value b : buffers)
-      args.push_back(genHostRegisterMemref(rewriter, loc, b));
-    auto saveIp = rewriter.saveInsertionPoint();
+    SmallVector<Value> tokens;
+    Value out = genParametersIn(rewriter, loc, scalars, buffers, args, tokens,
+                                /*useHostRegistrationForOut=*/false);
     // Set up GPU module and construct GPU function.
+    auto saveIp = rewriter.saveInsertionPoint();
     ModuleOp topModule = forallOp->getParentOfType<ModuleOp>();
     auto gpuModule = genGPUModule(rewriter, topModule);
     auto gpuFunc = genGPUFunc(rewriter, gpuModule, args);
     genGPUCode(rewriter, gpuFunc, forallOp, constants, scalars, buffers);
-    // Generate code that launches the kernel.
+    // Generate code that launches the kernel asynchronously, blocking on all
+    // opens tokens and yielding a new token for the output.
+    // TODO: Passing in tokens to launch up does not seem to be properly lowered
+    // by cubin yet, hence the current blocking wait.
     rewriter.restoreInsertionPoint(saveIp);
-    genLaunchGPUFunc(rewriter, gpuFunc, args, numThreads);
+    genBlockingWait(rewriter, loc, tokens);
+    tokens.clear();
+    Value kernelToken =
+        genLaunchGPUFunc(rewriter, gpuFunc, args, tokens, numThreads);
+    // Finalize the outlined arguments.
+    genParametersOut(rewriter, loc, out, kernelToken, scalars, buffers, args,
+                     tokens);
+    genBlockingWait(rewriter, loc, tokens);
     rewriter.eraseOp(forallOp);
     return success();
   }
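
For comparison, the host-registration path that remains available through the useHostRegistrationForOut flag casts the buffer to an unranked memref, registers it before the launch, and unregisters it afterwards. A minimal sketch of the IR that genHostRegisterMemref/genHostUnregisterMemref produce (value names and the memref type are placeholders):

  %cast = memref.cast %h_out : memref<1024xf64> to memref<*xf64>
  gpu.host_register %cast : memref<*xf64>
  // ... the kernel launch reads and writes the registered host buffer directly ...
  gpu.host_unregister %cast : memref<*xf64>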

mlir/test/Dialect/SparseTensor/GPU/gpu_combi.mlir

Lines changed: 38 additions & 4 deletions
@@ -7,12 +7,46 @@
 
 //
 // CHECK-LABEL: gpu.module @sparse_kernels
-// CHECK-DAG: gpu.func @kernel0
-// CHECK-DAG: gpu.func @kernel1
+// CHECK: gpu.func @kernel1
+// CHECK: gpu.func @kernel0
 //
 // CHECK-LABEL: func.func @matmuls
-// CHECK-DAG: gpu.launch_func @sparse_kernels::@kernel0 blocks
-// CHECK-DAG: gpu.launch_func @sparse_kernels::@kernel1 blocks
+// CHECK: gpu.alloc async
+// CHECK: gpu.memcpy async
+// CHECK: gpu.alloc async
+// CHECK: gpu.memcpy async
+// CHECK: gpu.alloc async
+// CHECK: gpu.memcpy async
+// CHECK: gpu.alloc async
+// CHECK: gpu.memcpy async
+// CHECK: gpu.alloc async
+// CHECK: gpu.memcpy async
+// CHECK: %[[T1:.*]] = gpu.launch_func async @sparse_kernels::@kernel1 blocks
+// CHECK: gpu.memcpy async [%[[T1]]]
+// CHECK: gpu.dealloc async
+// CHECK: gpu.dealloc async
+// CHECK: gpu.dealloc async
+// CHECK: gpu.dealloc async
+// CHECK: gpu.dealloc async
+// CHECK: gpu.wait
+// CHECK: gpu.alloc async
+// CHECK: gpu.memcpy async
+// CHECK: gpu.alloc async
+// CHECK: gpu.memcpy async
+// CHECK: gpu.alloc async
+// CHECK: gpu.memcpy async
+// CHECK: gpu.alloc async
+// CHECK: gpu.memcpy async
+// CHECK: gpu.alloc async
+// CHECK: gpu.memcpy async
+// CHECK: %[[T0:.*]] = gpu.launch_func async @sparse_kernels::@kernel0 blocks
+// CHECK: gpu.memcpy async [%[[T0]]]
+// CHECK: gpu.dealloc async
+// CHECK: gpu.dealloc async
+// CHECK: gpu.dealloc async
+// CHECK: gpu.dealloc async
+// CHECK: gpu.dealloc async
+// CHECK: gpu.wait
 //
 func.func @matmuls(%A: tensor<1024x8xf64>,
                    %B: tensor<8x1024xf64, #CSR>,

mlir/test/Dialect/SparseTensor/GPU/gpu_matmul.mlir

Lines changed: 28 additions & 6 deletions
@@ -47,12 +47,34 @@
 //
 //
 // CHECK-LABEL: func.func @matmul
-// CHECK: gpu.host_register
-// CHECK: gpu.host_register
-// CHECK: gpu.host_register
-// CHECK: gpu.host_register
-// CHECK: gpu.host_register
-// CHECK: gpu.launch_func @sparse_kernels::@kernel0 blocks
+// CHECK: gpu.wait async
+// CHECK: gpu.alloc async
+// CHECK: %[[S0:.*]] = gpu.memcpy async
+// CHECK: gpu.wait async
+// CHECK: gpu.alloc async
+// CHECK: %[[S1:.*]] = gpu.memcpy async
+// CHECK: gpu.wait async
+// CHECK: gpu.alloc async
+// CHECK: %[[S2:.*]] = gpu.memcpy async
+// CHECK: gpu.wait async
+// CHECK: gpu.alloc async
+// CHECK: %[[S3:.*]] = gpu.memcpy async
+// CHECK: gpu.wait async
+// CHECK: gpu.alloc async
+// CHECK: %[[S4:.*]] = gpu.memcpy async
+// CHECK: gpu.wait [%[[S0]], %[[S1]], %[[S2]], %[[S3]], %[[S4]]
+// CHECK: %[[T0:.*]] = gpu.launch_func async @sparse_kernels::@kernel0 blocks
+// CHECK: %[[M0:.*]] = gpu.memcpy async [%[[T0]]]
+// CHECK: %[[M1:.*]] = gpu.dealloc async [%[[M0]]]
+// CHECK: %[[M2:.*]] = gpu.wait async
+// CHECK: %[[M3:.*]] = gpu.dealloc async [%[[M2]]]
+// CHECK: %[[M4:.*]] = gpu.wait async
+// CHECK: %[[M5:.*]] = gpu.dealloc async [%[[M4]]]
+// CHECK: %[[M6:.*]] = gpu.wait async
+// CHECK: %[[M7:.*]] = gpu.dealloc async [%[[M6]]]
+// CHECK: %[[M8:.*]] = gpu.wait async
+// CHECK: %[[M9:.*]] = gpu.dealloc async [%[[M8]]]
+// CHECK: gpu.wait [%[[M1]], %[[M3]], %[[M5]], %[[M7]], %[[M9]]
 //
 func.func @matmul(%A: tensor<?x?xf64, #CSR>, %B: tensor<?x?xf64>, %C_in: tensor<?x?xf64>) -> tensor<?x?xf64> {
   %C_out = linalg.matmul

mlir/test/Dialect/SparseTensor/GPU/gpu_matvec.mlir

Lines changed: 28 additions & 6 deletions
@@ -43,12 +43,34 @@
 // CHECK: }
 //
 // CHECK-LABEL: func.func @matvec
-// CHECK: gpu.host_register
-// CHECK: gpu.host_register
-// CHECK: gpu.host_register
-// CHECK: gpu.host_register
-// CHECK: gpu.host_register
-// CHECK: gpu.launch_func @sparse_kernels::@kernel0 blocks
+// CHECK: gpu.wait async
+// CHECK: gpu.alloc async
+// CHECK: %[[S0:.*]] = gpu.memcpy async
+// CHECK: gpu.wait async
+// CHECK: gpu.alloc async
+// CHECK: %[[S1:.*]] = gpu.memcpy async
+// CHECK: gpu.wait async
+// CHECK: gpu.alloc async
+// CHECK: %[[S2:.*]] = gpu.memcpy async
+// CHECK: gpu.wait async
+// CHECK: gpu.alloc async
+// CHECK: %[[S3:.*]] = gpu.memcpy async
+// CHECK: gpu.wait async
+// CHECK: gpu.alloc async
+// CHECK: %[[S4:.*]] = gpu.memcpy async
+// CHECK: gpu.wait [%[[S0]], %[[S1]], %[[S2]], %[[S3]], %[[S4]]
+// CHECK: %[[T0:.*]] = gpu.launch_func async @sparse_kernels::@kernel0 blocks
+// CHECK: %[[M0:.*]] = gpu.memcpy async [%[[T0]]]
+// CHECK: %[[M1:.*]] = gpu.dealloc async [%[[M0]]]
+// CHECK: %[[M2:.*]] = gpu.wait async
+// CHECK: %[[M3:.*]] = gpu.dealloc async [%[[M2]]]
+// CHECK: %[[M4:.*]] = gpu.wait async
+// CHECK: %[[M5:.*]] = gpu.dealloc async [%[[M4]]]
+// CHECK: %[[M6:.*]] = gpu.wait async
+// CHECK: %[[M7:.*]] = gpu.dealloc async [%[[M6]]]
+// CHECK: %[[M8:.*]] = gpu.wait async
+// CHECK: %[[M9:.*]] = gpu.dealloc async [%[[M8]]]
+// CHECK: gpu.wait [%[[M1]], %[[M3]], %[[M5]], %[[M7]], %[[M9]]
 //
 func.func @matvec(%A: tensor<?x?xf64, #CSR>, %x: tensor<?xf64>, %y_in: tensor<?xf64>) -> tensor<?xf64> {
   %y_out = linalg.matvec
