Skip to content

Commit 563f414

Browse files
authored
[mlir][AMDGPU] Set uniform-work-group-size=true by default (llvm#79077)
GPU kernels generated via typical MLIR mechanisms make the assumption that all workgroups are of uniform size, and so, as in OpenMP, it is appropriate to set the "uniform-work-group-size"="true" attribute on these functions by default. This commit makes that choice. In the event it is needed, this commit adds `rocdl.uniform_work_group_size` as an attribute to be set on LLVM functions that can be used to override the default. In addition, add proper failure messages to translation.
1 parent 7b11e2e commit 563f414

File tree

3 files changed

+61
-9
lines changed

3 files changed

+61
-9
lines changed

mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,22 @@ def ROCDL_Dialect : Dialect {
2828
let hasOperationAttrVerify = 1;
2929

3030
let extraClassDeclaration = [{
31+
/// Get the name of the attribute used to annotate external kernel
32+
/// functions.
33+
static StringRef getKernelFuncAttrName() { return "rocdl.kernel"; }
34+
static constexpr ::llvm::StringLiteral getFlatWorkGroupSizeAttrName() {
35+
return ::llvm::StringLiteral("rocdl.flat_work_group_size");
36+
}
37+
static constexpr ::llvm::StringLiteral getReqdWorkGroupSizeAttrName() {
38+
return ::llvm::StringLiteral("rocdl.reqd_work_group_size");
39+
}
40+
/// MLIR's GPU-related infrastructure effectively assumes uniform workgroup
41+
/// sizes, so this attribute defaults to "true" on `rocdl.kernel` functions.
42+
/// It is provided here to allow overriding this assumption.
43+
static constexpr ::llvm::StringLiteral getUniformWorkGroupSizeAttrName() {
44+
return ::llvm::StringLiteral("rocdl.uniform_work_group_size");
45+
}
46+
3147
/// The address space value that represents global memory.
3248
static constexpr unsigned kGlobalMemoryAddressSpace = 1;
3349
/// The address space value that represents shared memory.

mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp

Lines changed: 37 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,9 @@ class ROCDLDialectLLVMIRTranslationInterface
8888
if (dialect->getKernelAttrHelper().getName() == attribute.getName()) {
8989
auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
9090
if (!func)
91-
return failure();
91+
return op->emitOpError(Twine(attribute.getName()) +
92+
" is only supported on `llvm.func` operations");
93+
;
9294

9395
// For GPU kernels,
9496
// 1. Insert AMDGPU_KERNEL calling convention.
@@ -100,6 +102,13 @@ class ROCDLDialectLLVMIRTranslationInterface
100102
if (!llvmFunc->hasFnAttribute("amdgpu-flat-work-group-size")) {
101103
llvmFunc->addFnAttr("amdgpu-flat-work-group-size", "1,256");
102104
}
105+
106+
// MLIR's GPU kernel APIs all assume and produce uniformly-sized
107+
// workgroups, so the lowering of the `rocdl.kernel` marker encodes this
108+
// assumption. This assumption may be overridden by setting
109+
// `rocdl.uniform_work_group_size` on a given function.
110+
if (!llvmFunc->hasFnAttribute("uniform-work-group-size"))
111+
llvmFunc->addFnAttr("uniform-work-group-size", "true");
103112
}
104113
// Override flat-work-group-size
105114
// TODO: update clients to rocdl.flat_work_group_size instead,
@@ -108,10 +117,12 @@ class ROCDLDialectLLVMIRTranslationInterface
108117
attribute.getName()) {
109118
auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
110119
if (!func)
111-
return failure();
120+
return op->emitOpError(Twine(attribute.getName()) +
121+
" is only supported on `llvm.func` operations");
112122
auto value = dyn_cast<IntegerAttr>(attribute.getValue());
113123
if (!value)
114-
return failure();
124+
return op->emitOpError(Twine(attribute.getName()) +
125+
" must be an integer");
115126

116127
llvm::Function *llvmFunc =
117128
moduleTranslation.lookupFunction(func.getName());
@@ -124,27 +135,45 @@ class ROCDLDialectLLVMIRTranslationInterface
124135
attribute.getName()) {
125136
auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
126137
if (!func)
127-
return failure();
138+
return op->emitOpError(Twine(attribute.getName()) +
139+
" is only supported on `llvm.func` operations");
128140
auto value = dyn_cast<StringAttr>(attribute.getValue());
129141
if (!value)
130-
return failure();
142+
return op->emitOpError(Twine(attribute.getName()) +
143+
" must be a string");
131144

132145
llvm::Function *llvmFunc =
133146
moduleTranslation.lookupFunction(func.getName());
134147
llvm::SmallString<8> llvmAttrValue;
135148
llvmAttrValue.append(value.getValue());
136149
llvmFunc->addFnAttr("amdgpu-flat-work-group-size", llvmAttrValue);
137150
}
138-
151+
if (ROCDL::ROCDLDialect::getUniformWorkGroupSizeAttrName() ==
152+
attribute.getName()) {
153+
auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
154+
if (!func)
155+
return op->emitOpError(Twine(attribute.getName()) +
156+
" is only supported on `llvm.func` operations");
157+
auto value = dyn_cast<BoolAttr>(attribute.getValue());
158+
if (!value)
159+
return op->emitOpError(Twine(attribute.getName()) +
160+
" must be a boolean");
161+
llvm::Function *llvmFunc =
162+
moduleTranslation.lookupFunction(func.getName());
163+
llvmFunc->addFnAttr("uniform-work-group-size",
164+
value.getValue() ? "true" : "false");
165+
}
139166
// Set reqd_work_group_size metadata
140167
if (dialect->getReqdWorkGroupSizeAttrHelper().getName() ==
141168
attribute.getName()) {
142169
auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
143170
if (!func)
144-
return failure();
171+
return op->emitOpError(Twine(attribute.getName()) +
172+
" is only supported on `llvm.func` operations");
145173
auto value = dyn_cast<DenseI32ArrayAttr>(attribute.getValue());
146174
if (!value)
147-
return failure();
175+
return op->emitOpError(Twine(attribute.getName()) +
176+
" must be a dense i32 array attribute");
148177
llvm::LLVMContext &llvmContext = moduleTranslation.getLLVMContext();
149178
SmallVector<llvm::Metadata *, 3> metadata;
150179
llvm::Type *i32 = llvm::IntegerType::get(llvmContext, 32);

mlir/test/Target/LLVMIR/rocdl.mlir

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,12 @@ llvm.func @known_block_sizes()
5656
llvm.return
5757
}
5858

59+
llvm.func @kernel_func_no_uniform_work_groups() attributes {rocdl.kernel, rocdl.uniform_work_group_size = false} {
60+
// CHECK-LABEL: amdgpu_kernel void @kernel_func_no_uniform_work_groups()
61+
// CHECK: #[[$KERNEL_NO_UNIFORM_WORK_GROUPS_ATTRS:[0-9]+]]
62+
llvm.return
63+
}
64+
5965
llvm.func @rocdl.lane_id() -> i32 {
6066
// CHECK: [[mbcntlo:%.+]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
6167
// CHECK-NEXT: call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[mbcntlo]])
@@ -505,8 +511,9 @@ llvm.func @rocdl_8bit_floats(%source: i32, %stoch: i32) -> i32 {
505511
llvm.return %source5 : i32
506512
}
507513

508-
// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" }
514+
// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "uniform-work-group-size"="true" }
509515
// CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1,1024"
510516
// CHECK-DAG: attributes #[[$KNOWN_BLOCK_SIZE_ATTRS]] = { "amdgpu-flat-work-group-size"="128,128"
517+
// CHECK-DAG: attributes #[[$KERNEL_NO_UNIFORM_WORK_GROUPS_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "uniform-work-group-size"="false" }
511518
// CHECK-DAG: ![[$RANGE]] = !{i32 0, i32 64}
512519
// CHECK-DAG: ![[$REQD_WORK_GROUP_SIZE]] = !{i32 16, i32 4, i32 2}

0 commit comments

Comments
 (0)