Skip to content

Commit f25985d

Browse files
committed
[mlir][AMDGPU] Set uniform-work-group-size=true by default
GPU kernels generated via typical MLIR mechanisms make the assumption that all workgroups are of uniform size, and so, as in OpenMP, it is appropriate to set the "uniform-work-group-size"="true" attribute on these functions by default. This commit makes that choice. In the event it is needed, this commit adds `rocdl.uniform_work_group_size` as an attribute to be set on LLVM functions that can be used to override the default. In addition, add proper failure messages to translation.
1 parent 4cba595 commit f25985d

File tree

3 files changed

+61
-9
lines changed

3 files changed

+61
-9
lines changed

mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,22 @@ def ROCDL_Dialect : Dialect {
2828
let hasOperationAttrVerify = 1;
2929

3030
let extraClassDeclaration = [{
31+
/// Get the name of the attribute used to annotate external kernel
32+
/// functions.
33+
static StringRef getKernelFuncAttrName() { return "rocdl.kernel"; }
34+
static constexpr ::llvm::StringLiteral getFlatWorkGroupSizeAttrName() {
35+
return ::llvm::StringLiteral("rocdl.flat_work_group_size");
36+
}
37+
static constexpr ::llvm::StringLiteral getReqdWorkGroupSizeAttrName() {
38+
return ::llvm::StringLiteral("rocdl.reqd_work_group_size");
39+
}
40+
/// MLIR's gpu-related infrastructure effectively assumes uniform workgroup
41+
/// sizes, so this attribute defaults to "true" on `rocdl.kernel` functions.
42+
/// It is provided here to allow overriding this assumption.
43+
static constexpr ::llvm::StringLiteral getUniformWorkGroupSizeAttrName() {
44+
return ::llvm::StringLiteral("rocdl.uniform_work_group_size");
45+
}
46+
3147
/// The address space value that represents global memory.
3248
static constexpr unsigned kGlobalMemoryAddressSpace = 1;
3349
/// The address space value that represents shared memory.

mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp

Lines changed: 37 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,9 @@ class ROCDLDialectLLVMIRTranslationInterface
8888
if (dialect->getKernelAttrHelper().getName() == attribute.getName()) {
8989
auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
9090
if (!func)
91-
return failure();
91+
return op->emitOpError(Twine(attribute.getName()) +
92+
" is only supported on `llvm.func` operations");
93+
;
9294

9395
// For GPU kernels,
9496
// 1. Insert AMDGPU_KERNEL calling convention.
@@ -100,6 +102,13 @@ class ROCDLDialectLLVMIRTranslationInterface
100102
if (!llvmFunc->hasFnAttribute("amdgpu-flat-work-group-size")) {
101103
llvmFunc->addFnAttr("amdgpu-flat-work-group-size", "1,256");
102104
}
105+
106+
// MLIR's GPU kernel APIs all assume and produce uniformly-sized
107+
// workgroups, so the lowering of the `rocdl.kernel` marker encodes this
108+
// assumption. This assumption may be overridden by setting
109+
// `rocdl.uniform_work_group_size` on a given function.
110+
if (!llvmFunc->hasFnAttribute("uniform-work-group-size"))
111+
llvmFunc->addFnAttr("uniform-work-group-size", "true");
103112
}
104113
// Override flat-work-group-size
105114
// TODO: update clients to rocdl.flat_work_group_size instead,
@@ -108,10 +117,12 @@ class ROCDLDialectLLVMIRTranslationInterface
108117
attribute.getName()) {
109118
auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
110119
if (!func)
111-
return failure();
120+
return op->emitOpError(Twine(attribute.getName()) +
121+
" is only supported on `llvm.func` operations");
112122
auto value = dyn_cast<IntegerAttr>(attribute.getValue());
113123
if (!value)
114-
return failure();
124+
return op->emitOpError(Twine(attribute.getName()) +
125+
" must be an integer");
115126

116127
llvm::Function *llvmFunc =
117128
moduleTranslation.lookupFunction(func.getName());
@@ -124,27 +135,45 @@ class ROCDLDialectLLVMIRTranslationInterface
124135
attribute.getName()) {
125136
auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
126137
if (!func)
127-
return failure();
138+
return op->emitOpError(Twine(attribute.getName()) +
139+
" is only supported on `llvm.func` operations");
128140
auto value = dyn_cast<StringAttr>(attribute.getValue());
129141
if (!value)
130-
return failure();
142+
return op->emitOpError(Twine(attribute.getName()) +
143+
" must be a string");
131144

132145
llvm::Function *llvmFunc =
133146
moduleTranslation.lookupFunction(func.getName());
134147
llvm::SmallString<8> llvmAttrValue;
135148
llvmAttrValue.append(value.getValue());
136149
llvmFunc->addFnAttr("amdgpu-flat-work-group-size", llvmAttrValue);
137150
}
138-
151+
if (ROCDL::ROCDLDialect::getUniformWorkGroupSizeAttrName() ==
152+
attribute.getName()) {
153+
auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
154+
if (!func)
155+
return op->emitOpError(Twine(attribute.getName()) +
156+
" is only supported on `llvm.func` operations");
157+
auto value = dyn_cast<BoolAttr>(attribute.getValue());
158+
if (!value)
159+
return op->emitOpError(Twine(attribute.getName()) +
160+
" must be a boolean");
161+
llvm::Function *llvmFunc =
162+
moduleTranslation.lookupFunction(func.getName());
163+
llvmFunc->addFnAttr("uniform-work-group-size",
164+
value.getValue() ? "true" : "false");
165+
}
139166
// Set reqd_work_group_size metadata
140167
if (dialect->getReqdWorkGroupSizeAttrHelper().getName() ==
141168
attribute.getName()) {
142169
auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
143170
if (!func)
144-
return failure();
171+
return op->emitOpError(Twine(attribute.getName()) +
172+
" is only supported on `llvm.func` operations");
145173
auto value = dyn_cast<DenseI32ArrayAttr>(attribute.getValue());
146174
if (!value)
147-
return failure();
175+
return op->emitOpError(Twine(attribute.getName()) +
176+
" must be a dense i32 array attribute");
148177
llvm::LLVMContext &llvmContext = moduleTranslation.getLLVMContext();
149178
SmallVector<llvm::Metadata *, 3> metadata;
150179
llvm::Type *i32 = llvm::IntegerType::get(llvmContext, 32);

mlir/test/Target/LLVMIR/rocdl.mlir

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,12 @@ llvm.func @known_block_sizes()
5656
llvm.return
5757
}
5858

59+
llvm.func @kernel_func_no_uniform_work_groups() attributes {rocdl.kernel, rocdl.uniform_work_group_size = false} {
60+
// CHECK-LABEL: amdgpu_kernel void @kernel_func_no_uniform_work_groups()
61+
// CHECK: #[[$KERNEL_NO_UNIFORM_WORK_GROUPS_ATTRS:[0-9]+]]
62+
llvm.return
63+
}
64+
5965
llvm.func @rocdl.lane_id() -> i32 {
6066
// CHECK: [[mbcntlo:%.+]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
6167
// CHECK-NEXT: call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[mbcntlo]])
@@ -505,8 +511,9 @@ llvm.func @rocdl_8bit_floats(%source: i32, %stoch: i32) -> i32 {
505511
llvm.return %source5 : i32
506512
}
507513

508-
// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" }
514+
// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "uniform-work-group-size"="true" }
509515
// CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1,1024"
510516
// CHECK-DAG: attributes #[[$KNOWN_BLOCK_SIZE_ATTRS]] = { "amdgpu-flat-work-group-size"="128,128"
517+
// CHECK-DAG: attributes #[[$KERNEL_NO_UNIFORM_WORK_GROUPS_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "uniform-work-group-size"="false" }
511518
// CHECK-DAG: ![[$RANGE]] = !{i32 0, i32 64}
512519
// CHECK-DAG: ![[$REQD_WORK_GROUP_SIZE]] = !{i32 16, i32 4, i32 2}

0 commit comments

Comments
 (0)