[flang][cuda] Convert gpu.launch_func to CUFLaunchClusterKernel when cluster dims are present #113959
Conversation
@llvm/pr-subscribers-flang-fir-hlfir

Author: Valentin Clement (バレンタイン クレメン) (clementval)

Changes

Kernel launches in CUF are converted to `gpu.launch_func`. When the kernel has `cluster_dims` specified, these get carried over to the `gpu.launch_func` operation. This patch updates the special conversion of `gpu.launch_func` when cluster dims are present to use the newly added entry point.

Full diff: https://github.com/llvm/llvm-project/pull/113959.diff

2 Files Affected:
- flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
- flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
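As a rough before/after sketch of the conversion (operand names here are placeholders; the authoritative IR is in the test updated below), a launch that carries cluster dims:

```mlir
gpu.launch_func @cuda_device_mod::@_QMmod1Psub1
    clusters in (%cx, %cy, %cz)
    blocks in (%gx, %gy, %gz)
    threads in (%bx, %by, %bz) : i64
    dynamic_shared_memory_size %dyn
```

previously made the pattern bail out with `mlir::failure()`; it is now rewritten into a call to the cluster-aware entry point, with the cluster sizes prepended to the usual grid/block arguments (`%args` stands for the packed kernel-argument pointer and `%null` for the trailing null pointer the pattern creates):

```mlir
%kernel = llvm.mlir.addressof @_QMmod1Psub1 : !llvm.ptr
llvm.call @_FortranACUFLaunchClusterKernel(
    %kernel, %cx, %cy, %cz, %gx, %gy, %gz, %bx, %by, %bz, %dyn, %args, %null)
    : (!llvm.ptr, i64, i64, i64, i64, i64, i64, i64, i64, i64, i32,
       !llvm.ptr, !llvm.ptr) -> ()
```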
diff --git a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
index 5645ce6e6858c8..c64f35542a6e59 100644
--- a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
@@ -76,11 +76,6 @@ struct GPULaunchKernelConversion
mlir::LogicalResult
matchAndRewrite(mlir::gpu::LaunchFuncOp op, OpAdaptor adaptor,
mlir::ConversionPatternRewriter &rewriter) const override {
-
- if (op.hasClusterSize()) {
- return mlir::failure();
- }
-
mlir::Location loc = op.getLoc();
auto *ctx = rewriter.getContext();
mlir::ModuleOp mod = op->getParentOfType<mlir::ModuleOp>();
@@ -107,37 +102,65 @@ struct GPULaunchKernelConversion
rewriter.create<LLVM::AddressOfOp>(loc, ptrTy, kernel.getName());
}
- auto funcOp = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(
- RTNAME_STRING(CUFLaunchKernel));
-
auto llvmIntPtrType = mlir::IntegerType::get(
ctx, this->getTypeConverter()->getPointerBitwidth(0));
auto voidTy = mlir::LLVM::LLVMVoidType::get(ctx);
- auto funcTy = mlir::LLVM::LLVMFunctionType::get(
- voidTy,
- {ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
- llvmIntPtrType, llvmIntPtrType, i32Ty, ptrTy, ptrTy},
- /*isVarArg=*/false);
-
- auto cufLaunchKernel = mlir::SymbolRefAttr::get(
- mod.getContext(), RTNAME_STRING(CUFLaunchKernel));
- if (!funcOp) {
- mlir::OpBuilder::InsertionGuard insertGuard(rewriter);
- rewriter.setInsertionPointToStart(mod.getBody());
- auto launchKernelFuncOp = rewriter.create<mlir::LLVM::LLVMFuncOp>(
- loc, RTNAME_STRING(CUFLaunchKernel), funcTy);
- launchKernelFuncOp.setVisibility(mlir::SymbolTable::Visibility::Private);
- }
mlir::Value nullPtr = rewriter.create<LLVM::ZeroOp>(loc, ptrTy);
- rewriter.replaceOpWithNewOp<mlir::LLVM::CallOp>(
- op, funcTy, cufLaunchKernel,
- mlir::ValueRange{kernelPtr, adaptor.getGridSizeX(),
- adaptor.getGridSizeY(), adaptor.getGridSizeZ(),
- adaptor.getBlockSizeX(), adaptor.getBlockSizeY(),
- adaptor.getBlockSizeZ(), dynamicMemorySize, kernelArgs,
- nullPtr});
+ if (op.hasClusterSize()) {
+ auto funcOp = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(
+ RTNAME_STRING(CUFLaunchClusterKernel));
+ auto funcTy = mlir::LLVM::LLVMFunctionType::get(
+ voidTy,
+ {ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
+ llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
+ llvmIntPtrType, llvmIntPtrType, i32Ty, ptrTy, ptrTy},
+ /*isVarArg=*/false);
+ auto cufLaunchClusterKernel = mlir::SymbolRefAttr::get(
+ mod.getContext(), RTNAME_STRING(CUFLaunchClusterKernel));
+ if (!funcOp) {
+ mlir::OpBuilder::InsertionGuard insertGuard(rewriter);
+ rewriter.setInsertionPointToStart(mod.getBody());
+ auto launchKernelFuncOp = rewriter.create<mlir::LLVM::LLVMFuncOp>(
+ loc, RTNAME_STRING(CUFLaunchClusterKernel), funcTy);
+ launchKernelFuncOp.setVisibility(
+ mlir::SymbolTable::Visibility::Private);
+ }
+ rewriter.replaceOpWithNewOp<mlir::LLVM::CallOp>(
+ op, funcTy, cufLaunchClusterKernel,
+ mlir::ValueRange{kernelPtr, adaptor.getClusterSizeX(),
+ adaptor.getClusterSizeY(), adaptor.getClusterSizeZ(),
+ adaptor.getGridSizeX(), adaptor.getGridSizeY(),
+ adaptor.getGridSizeZ(), adaptor.getBlockSizeX(),
+ adaptor.getBlockSizeY(), adaptor.getBlockSizeZ(),
+ dynamicMemorySize, kernelArgs, nullPtr});
+ } else {
+ auto funcOp = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(
+ RTNAME_STRING(CUFLaunchKernel));
+ auto funcTy = mlir::LLVM::LLVMFunctionType::get(
+ voidTy,
+ {ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
+ llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, i32Ty, ptrTy, ptrTy},
+ /*isVarArg=*/false);
+ auto cufLaunchKernel = mlir::SymbolRefAttr::get(
+ mod.getContext(), RTNAME_STRING(CUFLaunchKernel));
+ if (!funcOp) {
+ mlir::OpBuilder::InsertionGuard insertGuard(rewriter);
+ rewriter.setInsertionPointToStart(mod.getBody());
+ auto launchKernelFuncOp = rewriter.create<mlir::LLVM::LLVMFuncOp>(
+ loc, RTNAME_STRING(CUFLaunchKernel), funcTy);
+ launchKernelFuncOp.setVisibility(
+ mlir::SymbolTable::Visibility::Private);
+ }
+ rewriter.replaceOpWithNewOp<mlir::LLVM::CallOp>(
+ op, funcTy, cufLaunchKernel,
+ mlir::ValueRange{kernelPtr, adaptor.getGridSizeX(),
+ adaptor.getGridSizeY(), adaptor.getGridSizeZ(),
+ adaptor.getBlockSizeX(), adaptor.getBlockSizeY(),
+ adaptor.getBlockSizeZ(), dynamicMemorySize,
+ kernelArgs, nullPtr});
+ }
return mlir::success();
}
diff --git a/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir b/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
index f10bd82f978dc4..7fede7c6c17b78 100644
--- a/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
+++ b/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
@@ -1,4 +1,4 @@
-// RUN: fir-opt --cuf-gpu-convert-to-llvm %s | FileCheck %s
+// RUN: fir-opt --split-input-file --cuf-gpu-convert-to-llvm %s | FileCheck %s
module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (git@github.com:clementval/llvm-project.git ddcfd4d2dc17bf66cee8c3ef6284118684a2b0e6)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
llvm.func @_QMmod1Phost_sub() {
@@ -102,3 +102,25 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i1, dense<8> : ve
// CHECK: %[[KERNEL_PTR:.*]] = llvm.mlir.addressof @_QMmod1Psub1 : !llvm.ptr
// CHECK: llvm.call @_FortranACUFLaunchKernel(%[[KERNEL_PTR]], {{.*}})
+
+// -----
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (git@github.com:clementval/llvm-project.git 4116c1370ff76adf1e58eb3c39d0a14721794c70)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+ llvm.func @_FortranACUFLaunchClusterKernel(!llvm.ptr, i64, i64, i64, i64, i64, i64, i64, i64, i64, i32, !llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"}
+ llvm.func @_QMmod1Psub1() attributes {cuf.cluster_dims = #cuf.cluster_dims<x = 2 : i64, y = 2 : i64, z = 1 : i64>} {
+ llvm.return
+ }
+ llvm.func @_QQmain() attributes {fir.bindc_name = "test"} {
+ %0 = llvm.mlir.constant(1 : index) : i64
+ %1 = llvm.mlir.constant(2 : index) : i64
+ %2 = llvm.mlir.constant(0 : i32) : i32
+ %3 = llvm.mlir.constant(10 : index) : i64
+ gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 clusters in (%1, %1, %0) blocks in (%3, %3, %0) threads in (%3, %3, %0) : i64 dynamic_shared_memory_size %2
+ llvm.return
+ }
+ gpu.binary @cuda_device_mod [#gpu.object<#nvvm.target, "">]
+}
+
+// CHECK-LABEL: llvm.func @_QQmain()
+// CHECK: %[[KERNEL_PTR:.*]] = llvm.mlir.addressof @_QMmod1Psub1
+// CHECK: llvm.call @_FortranACUFLaunchClusterKernel(%[[KERNEL_PTR]], {{.*}})
LGTM.
[flang][cuda] Convert gpu.launch_func to CUFLaunchClusterKernel when cluster dims are present (llvm#113959)

Kernel launches in CUF are converted to `gpu.launch_func`. When the kernel has `cluster_dims` specified, these get carried over to the `gpu.launch_func` operation. This patch updates the special conversion of `gpu.launch_func` when cluster dims are present to use the newly added entry point.
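For reference, this is the declaration the pattern materializes when `_FortranACUFLaunchClusterKernel` is not already present in the module, annotated with how each parameter is populated; the role comments are inferred from the `mlir::ValueRange` built in the pattern above, not from the runtime's own documentation:

```mlir
llvm.func @_FortranACUFLaunchClusterKernel(
    !llvm.ptr,        // kernel function pointer
    i64, i64, i64,    // cluster size x, y, z
    i64, i64, i64,    // grid size x, y, z
    i64, i64, i64,    // block size x, y, z
    i32,              // dynamic shared memory size
    !llvm.ptr,        // packed kernel arguments
    !llvm.ptr         // trailing pointer, passed as null by this pattern
) attributes {sym_visibility = "private"}
```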