Skip to content

Commit a12f902

Browse files
committed
[mlir][AMDGPU] Set uniform-work-group-size=true by default
GPU kernels generated via typical MLIR mechanisms make the assumption that all workgroups are of uniform size, and so, as in OpenMP, it is appropriate to set the "uniform-work-group-size"="true" attribute on these functions by default. This commit makes that choice. In the event it is needed, this commit adds `rocdl.uniform_work_group_size` as an attribute to be set on LLVM functions that can be used to override the default.
1 parent 6953b36 commit a12f902

File tree

3 files changed

+33
-2
lines changed

3 files changed

+33
-2
lines changed

mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,12 @@ def ROCDL_Dialect : Dialect {
3737
static constexpr ::llvm::StringLiteral getReqdWorkGroupSizeAttrName() {
3838
return ::llvm::StringLiteral("rocdl.reqd_work_group_size");
3939
}
40+
/// MLIR's gpu-related infrastructure effectively assumes uniform workgroup
41+
/// sizes, so this attribute defaults to "true" on `rocdl.kernel` functions.
42+
/// It is provided here to allow overriding this assumption.
43+
static constexpr ::llvm::StringLiteral getUniformWorkGroupSizeAttrName() {
44+
return ::llvm::StringLiteral("rocdl.uniform_work_group_size");
45+
}
4046

4147
/// The address space value that represents global memory.
4248
static constexpr unsigned kGlobalMemoryAddressSpace = 1;

mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,12 @@ class ROCDLDialectLLVMIRTranslationInterface
9999
if (!llvmFunc->hasFnAttribute("amdgpu-flat-work-group-size")) {
100100
llvmFunc->addFnAttr("amdgpu-flat-work-group-size", "1,256");
101101
}
102+
// MLIR's GPU kernel APIs all assume and produce uniformly-sized
103+
// workgroups, so the lowering of the `rocdl.kernel` marker encodes this
104+
// assumption. This assumption may be overridden by setting
105+
// `rocdl.uniform_work_group_size` on a given function.
106+
if (!llvmFunc->hasFnAttribute("uniform-work-group-size"))
107+
llvmFunc->addFnAttr("uniform-work-group-size", "true");
102108
}
103109
// Override flat-work-group-size
104110
// TODO: update clients to rocdl.flat_work_group_size instead,
@@ -133,7 +139,19 @@ class ROCDLDialectLLVMIRTranslationInterface
133139
llvmAttrValue.append(value.getValue());
134140
llvmFunc->addFnAttr("amdgpu-flat-work-group-size", llvmAttrValue);
135141
}
136-
142+
if (ROCDL::ROCDLDialect::getUniformWorkGroupSizeAttrName() ==
143+
attribute.getName()) {
144+
auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
145+
if (!func)
146+
return failure();
147+
auto value = dyn_cast<BoolAttr>(attribute.getValue());
148+
if (!value)
149+
return failure();
150+
llvm::Function *llvmFunc =
151+
moduleTranslation.lookupFunction(func.getName());
152+
llvmFunc->addFnAttr("uniform-work-group-size",
153+
value.getValue() ? "true" : "false");
154+
}
137155
// Set reqd_work_group_size metadata
138156
if (ROCDL::ROCDLDialect::getReqdWorkGroupSizeAttrName() ==
139157
attribute.getName()) {

mlir/test/Target/LLVMIR/rocdl.mlir

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,12 @@ llvm.func @known_block_sizes()
5656
llvm.return
5757
}
5858

59+
llvm.func @kernel_func_no_uniform_work_groups() attributes {rocdl.kernel, rocdl.uniform_work_group_size = false} {
60+
// CHECK-LABEL: amdgpu_kernel void @kernel_func_no_uniform_work_groups()
61+
// CHECK: #[[$KERNEL_NO_UNIFORM_WORK_GROUPS_ATTRS:[0-9]+]]
62+
llvm.return
63+
}
64+
5965
llvm.func @rocdl.lane_id() -> i32 {
6066
// CHECK: [[mbcntlo:%.+]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
6167
// CHECK-NEXT: call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[mbcntlo]])
@@ -489,8 +495,9 @@ llvm.func @rocdl_8bit_floats(%source: i32, %stoch: i32) -> i32 {
489495
llvm.return %source5 : i32
490496
}
491497

492-
// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" }
498+
// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "uniform-work-group-size"="true" }
493499
// CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1,1024"
494500
// CHECK-DAG: attributes #[[$KNOWN_BLOCK_SIZE_ATTRS]] = { "amdgpu-flat-work-group-size"="128,128"
501+
// CHECK-DAG: attributes #[[$KERNEL_NO_UNIFORM_WORK_GROUPS_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "uniform-work-group-size"="false" }
495502
// CHECK-DAG: ![[$RANGE]] = !{i32 0, i32 64}
496503
// CHECK-DAG: ![[$REQD_WORK_GROUP_SIZE]] = !{i32 16, i32 4, i32 2}

0 commit comments

Comments
 (0)