Commit 763109e

[mlir][gpu] Use known_block_size to set maxntid for NVVM target (#77301)
Setting the thread block size on a kernel via `maxntid` has significant performance benefits: it lets the downstream PTX compiler do better register allocation. MLIR's `gpu.launch` and `gpu.launch_func` already have an attribute (`known_block_size`) that records the thread block size when it is known. This PR simply uses that attribute to set `maxntid`.
1 parent 2edce42 commit 763109e
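As a rough before/after sketch of the intended effect (the module and kernel names here are illustrative; the shape mirrors the regression test added in this commit), a `gpu.func` that carries `gpu.known_block_size` is expected to pick up `nvvm.maxntid` when lowered with the GPU-to-NVVM conversion (e.g. `-convert-gpu-to-nvvm`):

gpu.module @example {
  // Block size is statically known: 128 x 1 x 1 threads.
  gpu.func @my_kernel() kernel
      attributes {gpu.known_block_size = array<i32: 128, 1, 1>} {
    gpu.return
  }
}

// After lowering, the function is expected to carry nvvm.maxntid derived
// from known_block_size (missing dimensions default to 1), roughly:
//
//   llvm.func @my_kernel() attributes {gpu.kernel,
//       gpu.known_block_size = array<i32: 128, 1, 1>, nvvm.kernel,
//       nvvm.maxntid = [128 : i32, 1 : i32, 1 : i32]} { ... }

Kernels without a `known_block_size` attribute simply get no `nvvm.maxntid`, since the lowering only emits the block-size attribute when at least one dimension is known.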

4 files changed: +40 −6 lines


mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp

Lines changed: 19 additions & 1 deletion
@@ -85,8 +85,26 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
   // Add a dialect specific kernel attribute in addition to GPU kernel
   // attribute. The former is necessary for further translation while the
   // latter is expected by gpu.launch_func.
-  if (gpuFuncOp.isKernel())
+  if (gpuFuncOp.isKernel()) {
     attributes.emplace_back(kernelAttributeName, rewriter.getUnitAttr());
+
+    // Set the block size attribute if it is present.
+    if (kernelBlockSizeAttributeName.has_value()) {
+      std::optional<int32_t> dimX =
+          gpuFuncOp.getKnownBlockSize(gpu::Dimension::x);
+      std::optional<int32_t> dimY =
+          gpuFuncOp.getKnownBlockSize(gpu::Dimension::y);
+      std::optional<int32_t> dimZ =
+          gpuFuncOp.getKnownBlockSize(gpu::Dimension::z);
+      if (dimX.has_value() || dimY.has_value() || dimZ.has_value()) {
+        // If any of the dimensions are missing, fill them in with 1.
+        attributes.emplace_back(
+            kernelBlockSizeAttributeName.value(),
+            rewriter.getI32ArrayAttr(
+                {dimX.value_or(1), dimY.value_or(1), dimZ.value_or(1)}));
+      }
+    }
+  }
   auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
       gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType,
       LLVM::Linkage::External, /*dsoLocal=*/false, /*cconv=*/LLVM::CConv::C,

mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h

Lines changed: 9 additions & 4 deletions
@@ -36,13 +36,15 @@ struct GPUDynamicSharedMemoryOpLowering
 };
 
 struct GPUFuncOpLowering : ConvertOpToLLVMPattern<gpu::GPUFuncOp> {
-  GPUFuncOpLowering(const LLVMTypeConverter &converter,
-                    unsigned allocaAddrSpace, unsigned workgroupAddrSpace,
-                    StringAttr kernelAttributeName)
+  GPUFuncOpLowering(
+      const LLVMTypeConverter &converter, unsigned allocaAddrSpace,
+      unsigned workgroupAddrSpace, StringAttr kernelAttributeName,
+      std::optional<StringAttr> kernelBlockSizeAttributeName = std::nullopt)
       : ConvertOpToLLVMPattern<gpu::GPUFuncOp>(converter),
         allocaAddrSpace(allocaAddrSpace),
         workgroupAddrSpace(workgroupAddrSpace),
-        kernelAttributeName(kernelAttributeName) {}
+        kernelAttributeName(kernelAttributeName),
+        kernelBlockSizeAttributeName(kernelBlockSizeAttributeName) {}
 
   LogicalResult
   matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
@@ -56,6 +58,9 @@ struct GPUFuncOpLowering : ConvertOpToLLVMPattern<gpu::GPUFuncOp> {
 
   /// The attribute name to use instead of `gpu.kernel`.
   StringAttr kernelAttributeName;
+
+  /// The attribute name used to set the block size.
+  std::optional<StringAttr> kernelBlockSizeAttributeName;
 };
 
 /// The lowering of gpu.printf to a call to HIP hostcalls

mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp

Lines changed: 3 additions & 1 deletion
@@ -352,7 +352,9 @@ void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
       /*workgroupAddrSpace=*/
       static_cast<unsigned>(NVVM::NVVMMemorySpace::kSharedMemorySpace),
       StringAttr::get(&converter.getContext(),
-                      NVVM::NVVMDialect::getKernelFuncAttrName()));
+                      NVVM::NVVMDialect::getKernelFuncAttrName()),
+      StringAttr::get(&converter.getContext(),
+                      NVVM::NVVMDialect::getMaxntidAttrName()));
 
   populateOpPatterns<math::AbsFOp>(converter, patterns, "__nv_fabsf",
                                    "__nv_fabs");

mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir

Lines changed: 9 additions & 0 deletions
@@ -627,6 +627,15 @@ gpu.module @test_module_31 {
   }
 }
 
+gpu.module @gpumodule {
+// CHECK-LABEL: func @kernel_with_block_size()
+// CHECK: attributes {gpu.kernel, gpu.known_block_size = array<i32: 128, 1, 1>, nvvm.kernel, nvvm.maxntid = [128 : i32, 1 : i32, 1 : i32]}
+  gpu.func @kernel_with_block_size() kernel attributes {gpu.known_block_size = array<i32: 128, 1, 1>} {
+    gpu.return
+  }
+}
+
+
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%toplevel_module: !transform.any_op {transform.readonly}) {
     %gpu_module = transform.structured.match ops{["gpu.module"]} in %toplevel_module
