Skip to content

Commit 1eedc8a

Browse files
committed
[mlir][AMDGPU] Set uniform-work-group-size=true by default
GPU kernels generated via typical MLIR mechanisms make the assumption that all workgroups are of uniform size, and so, as in OpenMP, it is appropriate to set the "uniform-work-group-size"="true" attribute on these functions by default. This commit makes that choice. In the event it is needed, this commit adds `rocdl.uniform_work_group_size` as an attribute to be set on LLVM functions that can be used to override the default.
1 parent eabddf2 commit 1eedc8a

File tree

3 files changed

+33
-2
lines changed

3 files changed

+33
-2
lines changed

mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,12 @@ def ROCDL_Dialect : Dialect {
3737
static constexpr ::llvm::StringLiteral getReqdWorkGroupSizeAttrName() {
3838
return ::llvm::StringLiteral("rocdl.reqd_work_group_size");
3939
}
40+
/// MLIR's gpu-related infrastructure effectively assumes uniform workgroup
41+
/// sizes, so this attribute defaults to "true" on `rocdl.kernel` functions.
42+
/// It is provided here to allow overriding this assumption.
43+
static constexpr ::llvm::StringLiteral getUniformWorkGroupSizeAttrName() {
44+
return ::llvm::StringLiteral("rocdl.uniform_work_group_size");
45+
}
4046

4147
/// The address space value that represents global memory.
4248
static constexpr unsigned kGlobalMemoryAddressSpace = 1;

mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,12 @@ class ROCDLDialectLLVMIRTranslationInterface
100100
llvmFunc->addFnAttr("amdgpu-flat-work-group-size", "1,256");
101101
}
102102

103+
// MLIR's GPU kernel APIs all assume and produce uniformly-sized
104+
// workgroups, so the lowering of the `rocdl.kernel` marker encodes this
105+
// assumption. This assumption may be overridden by setting
106+
// `rocdl.uniform_work_group_size` on a given function.
107+
if (!llvmFunc->hasFnAttribute("uniform-work-group-size"))
108+
llvmFunc->addFnAttr("uniform-work-group-size", "true");
103109
}
104110
// Override flat-work-group-size
105111
// TODO: update clients to rocdl.flat_work_group_size instead,
@@ -134,7 +140,19 @@ class ROCDLDialectLLVMIRTranslationInterface
134140
llvmAttrValue.append(value.getValue());
135141
llvmFunc->addFnAttr("amdgpu-flat-work-group-size", llvmAttrValue);
136142
}
137-
143+
if (ROCDL::ROCDLDialect::getUniformWorkGroupSizeAttrName() ==
144+
attribute.getName()) {
145+
auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
146+
if (!func)
147+
return failure();
148+
auto value = dyn_cast<BoolAttr>(attribute.getValue());
149+
if (!value)
150+
return failure();
151+
llvm::Function *llvmFunc =
152+
moduleTranslation.lookupFunction(func.getName());
153+
llvmFunc->addFnAttr("uniform-work-group-size",
154+
value.getValue() ? "true" : "false");
155+
}
138156
// Set reqd_work_group_size metadata
139157
if (ROCDL::ROCDLDialect::getReqdWorkGroupSizeAttrName() ==
140158
attribute.getName()) {

mlir/test/Target/LLVMIR/rocdl.mlir

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,12 @@ llvm.func @known_block_sizes()
5656
llvm.return
5757
}
5858

59+
llvm.func @kernel_func_no_uniform_work_groups() attributes {rocdl.kernel, rocdl.uniform_work_group_size = false} {
60+
// CHECK-LABEL: amdgpu_kernel void @kernel_func_no_uniform_work_groups()
61+
// CHECK: #[[$KERNEL_NO_UNIFORM_WORK_GROUPS_ATTRS:[0-9]+]]
62+
llvm.return
63+
}
64+
5965
llvm.func @rocdl.lane_id() -> i32 {
6066
// CHECK: [[mbcntlo:%.+]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
6167
// CHECK-NEXT: call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[mbcntlo]])
@@ -489,8 +495,9 @@ llvm.func @rocdl_8bit_floats(%source: i32, %stoch: i32) -> i32 {
489495
llvm.return %source5 : i32
490496
}
491497

492-
// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" }
498+
// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "uniform-work-group-size"="true" }
493499
// CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1,1024"
494500
// CHECK-DAG: attributes #[[$KNOWN_BLOCK_SIZE_ATTRS]] = { "amdgpu-flat-work-group-size"="128,128"
501+
// CHECK-DAG: attributes #[[$KERNEL_NO_UNIFORM_WORK_GROUPS_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "uniform-work-group-size"="false" }
495502
// CHECK-DAG: ![[$RANGE]] = !{i32 0, i32 64}
496503
// CHECK-DAG: ![[$REQD_WORK_GROUP_SIZE]] = !{i32 16, i32 4, i32 2}

0 commit comments

Comments
 (0)