Skip to content

[flang][cuda] Convert gpu.launch_func to CUFLaunchClusterKernel when cluster dims are present #113959

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion flang/include/flang/Runtime/CUDA/kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,19 @@

extern "C" {

// These functions use intptr_t instead of CUDA's unsigned int to match
// the type of MLIR's index type. This avoids the need for casts in the
// generated MLIR code.

// Launch the device kernel identified by `kernelName` with the given
// grid and block dimensions, `smem` bytes of dynamic shared memory, and
// the kernel argument array `params` (one pointer per kernel argument).
void RTDEF(CUFLaunchKernel)(const void *kernelName, intptr_t gridX,
    intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
    intptr_t blockZ, int32_t smem, void **params, void **extra);

// Same as CUFLaunchKernel, but additionally specifies the thread-block
// cluster dimensions (clusterX/Y/Z) to use for the launch.
void RTDEF(CUFLaunchClusterKernel)(const void *kernelName, intptr_t clusterX,
    intptr_t clusterY, intptr_t clusterZ, intptr_t gridX, intptr_t gridY,
    intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ,
    int32_t smem, void **params, void **extra);

} // extern "C"

#endif // FORTRAN_RUNTIME_CUDA_KERNEL_H_
83 changes: 53 additions & 30 deletions flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,11 +76,6 @@ struct GPULaunchKernelConversion
mlir::LogicalResult
matchAndRewrite(mlir::gpu::LaunchFuncOp op, OpAdaptor adaptor,
mlir::ConversionPatternRewriter &rewriter) const override {

if (op.hasClusterSize()) {
return mlir::failure();
}

mlir::Location loc = op.getLoc();
auto *ctx = rewriter.getContext();
mlir::ModuleOp mod = op->getParentOfType<mlir::ModuleOp>();
Expand All @@ -107,37 +102,65 @@ struct GPULaunchKernelConversion
rewriter.create<LLVM::AddressOfOp>(loc, ptrTy, kernel.getName());
}

auto funcOp = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(
RTNAME_STRING(CUFLaunchKernel));

auto llvmIntPtrType = mlir::IntegerType::get(
ctx, this->getTypeConverter()->getPointerBitwidth(0));
auto voidTy = mlir::LLVM::LLVMVoidType::get(ctx);
auto funcTy = mlir::LLVM::LLVMFunctionType::get(
voidTy,
{ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
llvmIntPtrType, llvmIntPtrType, i32Ty, ptrTy, ptrTy},
/*isVarArg=*/false);

auto cufLaunchKernel = mlir::SymbolRefAttr::get(
mod.getContext(), RTNAME_STRING(CUFLaunchKernel));
if (!funcOp) {
mlir::OpBuilder::InsertionGuard insertGuard(rewriter);
rewriter.setInsertionPointToStart(mod.getBody());
auto launchKernelFuncOp = rewriter.create<mlir::LLVM::LLVMFuncOp>(
loc, RTNAME_STRING(CUFLaunchKernel), funcTy);
launchKernelFuncOp.setVisibility(mlir::SymbolTable::Visibility::Private);
}

mlir::Value nullPtr = rewriter.create<LLVM::ZeroOp>(loc, ptrTy);

rewriter.replaceOpWithNewOp<mlir::LLVM::CallOp>(
op, funcTy, cufLaunchKernel,
mlir::ValueRange{kernelPtr, adaptor.getGridSizeX(),
adaptor.getGridSizeY(), adaptor.getGridSizeZ(),
adaptor.getBlockSizeX(), adaptor.getBlockSizeY(),
adaptor.getBlockSizeZ(), dynamicMemorySize, kernelArgs,
nullPtr});
if (op.hasClusterSize()) {
auto funcOp = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(
RTNAME_STRING(CUFLaunchClusterKernel));
auto funcTy = mlir::LLVM::LLVMFunctionType::get(
voidTy,
{ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
llvmIntPtrType, llvmIntPtrType, i32Ty, ptrTy, ptrTy},
/*isVarArg=*/false);
auto cufLaunchClusterKernel = mlir::SymbolRefAttr::get(
mod.getContext(), RTNAME_STRING(CUFLaunchClusterKernel));
if (!funcOp) {
mlir::OpBuilder::InsertionGuard insertGuard(rewriter);
rewriter.setInsertionPointToStart(mod.getBody());
auto launchKernelFuncOp = rewriter.create<mlir::LLVM::LLVMFuncOp>(
loc, RTNAME_STRING(CUFLaunchClusterKernel), funcTy);
launchKernelFuncOp.setVisibility(
mlir::SymbolTable::Visibility::Private);
}
rewriter.replaceOpWithNewOp<mlir::LLVM::CallOp>(
op, funcTy, cufLaunchClusterKernel,
mlir::ValueRange{kernelPtr, adaptor.getClusterSizeX(),
adaptor.getClusterSizeY(), adaptor.getClusterSizeZ(),
adaptor.getGridSizeX(), adaptor.getGridSizeY(),
adaptor.getGridSizeZ(), adaptor.getBlockSizeX(),
adaptor.getBlockSizeY(), adaptor.getBlockSizeZ(),
dynamicMemorySize, kernelArgs, nullPtr});
} else {
auto funcOp = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(
RTNAME_STRING(CUFLaunchKernel));
auto funcTy = mlir::LLVM::LLVMFunctionType::get(
voidTy,
{ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, i32Ty, ptrTy, ptrTy},
/*isVarArg=*/false);
auto cufLaunchKernel = mlir::SymbolRefAttr::get(
mod.getContext(), RTNAME_STRING(CUFLaunchKernel));
if (!funcOp) {
mlir::OpBuilder::InsertionGuard insertGuard(rewriter);
rewriter.setInsertionPointToStart(mod.getBody());
auto launchKernelFuncOp = rewriter.create<mlir::LLVM::LLVMFuncOp>(
loc, RTNAME_STRING(CUFLaunchKernel), funcTy);
launchKernelFuncOp.setVisibility(
mlir::SymbolTable::Visibility::Private);
}
rewriter.replaceOpWithNewOp<mlir::LLVM::CallOp>(
op, funcTy, cufLaunchKernel,
mlir::ValueRange{kernelPtr, adaptor.getGridSizeX(),
adaptor.getGridSizeY(), adaptor.getGridSizeZ(),
adaptor.getBlockSizeX(), adaptor.getBlockSizeY(),
adaptor.getBlockSizeZ(), dynamicMemorySize,
kernelArgs, nullPtr});
}

return mlir::success();
}
Expand Down
25 changes: 24 additions & 1 deletion flang/runtime/CUDA/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,32 @@ void RTDEF(CUFLaunchKernel)(const void *kernel, intptr_t gridX, intptr_t gridY,
blockDim.x = blockX;
blockDim.y = blockY;
blockDim.z = blockZ;
cudaStream_t stream = 0;
cudaStream_t stream = 0; // TODO stream managment
CUDA_REPORT_IF_ERROR(
cudaLaunchKernel(kernel, gridDim, blockDim, params, smem, stream));
}

// Launch `kernel` with an explicit thread-block cluster dimension.
// The classic cudaLaunchKernel entry point cannot express cluster
// attributes, so this uses the extensible launch API
// (cudaLaunchKernelExC) with a cudaLaunchAttributeClusterDimension
// attribute attached to the launch configuration.
void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX,
    intptr_t clusterY, intptr_t clusterZ, intptr_t gridX, intptr_t gridY,
    intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ,
    int32_t smem, void **params, void **extra) {
  cudaLaunchConfig_t config;
  config.gridDim.x = gridX;
  config.gridDim.y = gridY;
  config.gridDim.z = gridZ;
  config.blockDim.x = blockX;
  config.blockDim.y = blockY;
  config.blockDim.z = blockZ;
  config.dynamicSmemBytes = smem;
  config.stream = 0; // TODO stream management
  // Describe the cluster shape as the single launch attribute.
  cudaLaunchAttribute launchAttr[1];
  launchAttr[0].id = cudaLaunchAttributeClusterDimension;
  launchAttr[0].val.clusterDim.x = clusterX;
  launchAttr[0].val.clusterDim.y = clusterY;
  launchAttr[0].val.clusterDim.z = clusterZ;
  config.numAttrs = 1;
  config.attrs = launchAttr;
  // NOTE(review): `extra` is accepted for interface symmetry with
  // CUFLaunchKernel but is not forwarded to the launch here.
  CUDA_REPORT_IF_ERROR(cudaLaunchKernelExC(&config, kernel, params));
}

} // extern "C"
24 changes: 23 additions & 1 deletion flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// RUN: fir-opt --cuf-gpu-convert-to-llvm %s | FileCheck %s
// RUN: fir-opt --split-input-file --cuf-gpu-convert-to-llvm %s | FileCheck %s

module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 ([email protected]:clementval/llvm-project.git ddcfd4d2dc17bf66cee8c3ef6284118684a2b0e6)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
llvm.func @_QMmod1Phost_sub() {
Expand Down Expand Up @@ -102,3 +102,25 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i1, dense<8> : ve

// CHECK: %[[KERNEL_PTR:.*]] = llvm.mlir.addressof @_QMmod1Psub1 : !llvm.ptr
// CHECK: llvm.call @_FortranACUFLaunchKernel(%[[KERNEL_PTR]], {{.*}})

// -----

// Check that a gpu.launch_func carrying `clusters in (...)` dimensions is
// lowered to a call to _FortranACUFLaunchClusterKernel, with the three
// cluster dimensions passed ahead of the grid/block arguments.
module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 ([email protected]:clementval/llvm-project.git 4116c1370ff76adf1e58eb3c39d0a14721794c70)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
  // Pre-declared runtime entry point; the pass should reuse it, not redeclare.
  llvm.func @_FortranACUFLaunchClusterKernel(!llvm.ptr, i64, i64, i64, i64, i64, i64, i64, i64, i64, i32, !llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"}
  // Host-side stub of the device kernel, annotated with its cluster dims.
  llvm.func @_QMmod1Psub1() attributes {cuf.cluster_dims = #cuf.cluster_dims<x = 2 : i64, y = 2 : i64, z = 1 : i64>} {
    llvm.return
  }
  llvm.func @_QQmain() attributes {fir.bindc_name = "test"} {
    %0 = llvm.mlir.constant(1 : index) : i64
    %1 = llvm.mlir.constant(2 : index) : i64
    %2 = llvm.mlir.constant(0 : i32) : i32
    %3 = llvm.mlir.constant(10 : index) : i64
    gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 clusters in (%1, %1, %0) blocks in (%3, %3, %0) threads in (%3, %3, %0) : i64 dynamic_shared_memory_size %2
    llvm.return
  }
  gpu.binary @cuda_device_mod [#gpu.object<#nvvm.target, "">]
}

// CHECK-LABEL: llvm.func @_QQmain()
// CHECK: %[[KERNEL_PTR:.*]] = llvm.mlir.addressof @_QMmod1Psub1
// CHECK: llvm.call @_FortranACUFLaunchClusterKernel(%[[KERNEL_PTR]], {{.*}})
Loading