Skip to content

[mlir][AMDGPU] Set uniform-work-group-size=true by default #79077

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,22 @@ def ROCDL_Dialect : Dialect {
let hasOperationAttrVerify = 1;

let extraClassDeclaration = [{
/// Get the name of the attribute used to annotate external kernel
/// functions.
static StringRef getKernelFuncAttrName() { return "rocdl.kernel"; }
/// Get the name of the `rocdl.flat_work_group_size` attribute.
static constexpr ::llvm::StringLiteral getFlatWorkGroupSizeAttrName() {
return ::llvm::StringLiteral("rocdl.flat_work_group_size");
}
/// Get the name of the `rocdl.reqd_work_group_size` attribute.
static constexpr ::llvm::StringLiteral getReqdWorkGroupSizeAttrName() {
return ::llvm::StringLiteral("rocdl.reqd_work_group_size");
}
/// Get the name of the `rocdl.uniform_work_group_size` attribute.
/// MLIR's GPU-related infrastructure effectively assumes uniform workgroup
/// sizes, so this attribute defaults to "true" on `rocdl.kernel` functions.
/// It is provided here to allow overriding this assumption.
static constexpr ::llvm::StringLiteral getUniformWorkGroupSizeAttrName() {
return ::llvm::StringLiteral("rocdl.uniform_work_group_size");
}

/// The address space value that represents global memory.
static constexpr unsigned kGlobalMemoryAddressSpace = 1;
/// The address space value that represents shared memory.
Expand Down
45 changes: 37 additions & 8 deletions mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,9 @@ class ROCDLDialectLLVMIRTranslationInterface
if (dialect->getKernelAttrHelper().getName() == attribute.getName()) {
auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
if (!func)
return failure();
return op->emitOpError(Twine(attribute.getName()) +
" is only supported on `llvm.func` operations");
;

// For GPU kernels,
// 1. Insert AMDGPU_KERNEL calling convention.
Expand All @@ -100,6 +102,13 @@ class ROCDLDialectLLVMIRTranslationInterface
if (!llvmFunc->hasFnAttribute("amdgpu-flat-work-group-size")) {
llvmFunc->addFnAttr("amdgpu-flat-work-group-size", "1,256");
}

// MLIR's GPU kernel APIs all assume and produce uniformly-sized
// workgroups, so the lowering of the `rocdl.kernel` marker encodes this
// assumption. This assumption may be overridden by setting
// `rocdl.uniform_work_group_size` on a given function.
if (!llvmFunc->hasFnAttribute("uniform-work-group-size"))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do you need to check if the attribute is already set on the function? Aren't you constructing this fresh IR?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't have any guarantee about whether we'll process `rocdl.kernel` or `rocdl.uniform_work_group_size` first, so if I've already encountered a manual definition of the uniform work group size, I don't want to overwrite it with the default.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems like a peculiar process but I have no idea how MLIR does things. I'd expect it to look like clang where the full set of attributes is considered when initially emitting the IR attributes

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, this is the `amendOperation` hook. It gets called once per attribute whenever an operation being translated carries a `rocdl` attribute.
So given

llvm.func @foo(...)  attributes {llvm.something = N : i64, rocdl.kernel, rocdl.uniform_work_group_size = true}

The LLVM dialect is responsible for translating llvm.func into LLVM IR, and then you'll end up with three amendOperation() calls in some order, one to the LLVM dialect's, two to ROCDL's.

llvmFunc->addFnAttr("uniform-work-group-size", "true");
}
// Override flat-work-group-size
// TODO: update clients to rocdl.flat_work_group_size instead,
Expand All @@ -108,10 +117,12 @@ class ROCDLDialectLLVMIRTranslationInterface
attribute.getName()) {
auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
if (!func)
return failure();
return op->emitOpError(Twine(attribute.getName()) +
" is only supported on `llvm.func` operations");
auto value = dyn_cast<IntegerAttr>(attribute.getValue());
if (!value)
return failure();
return op->emitOpError(Twine(attribute.getName()) +
" must be an integer");

llvm::Function *llvmFunc =
moduleTranslation.lookupFunction(func.getName());
Expand All @@ -124,27 +135,45 @@ class ROCDLDialectLLVMIRTranslationInterface
attribute.getName()) {
auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
if (!func)
return failure();
return op->emitOpError(Twine(attribute.getName()) +
" is only supported on `llvm.func` operations");
auto value = dyn_cast<StringAttr>(attribute.getValue());
if (!value)
return failure();
return op->emitOpError(Twine(attribute.getName()) +
" must be a string");

llvm::Function *llvmFunc =
moduleTranslation.lookupFunction(func.getName());
llvm::SmallString<8> llvmAttrValue;
llvmAttrValue.append(value.getValue());
llvmFunc->addFnAttr("amdgpu-flat-work-group-size", llvmAttrValue);
}

// Set uniform-work-group-size from an explicit
// `rocdl.uniform_work_group_size` attribute. The `rocdl.kernel` handling
// only adds its default of "true" when the function does not already carry
// this attribute, so a value written here is not replaced by that default.
if (ROCDL::ROCDLDialect::getUniformWorkGroupSizeAttrName() ==
attribute.getName()) {
// The attribute is only meaningful on `llvm.func`; reject anything else
// with a diagnostic rather than failing silently.
auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
if (!func)
return op->emitOpError(Twine(attribute.getName()) +
" is only supported on `llvm.func` operations");
auto value = dyn_cast<BoolAttr>(attribute.getValue());
if (!value)
return op->emitOpError(Twine(attribute.getName()) +
" must be a boolean");
llvm::Function *llvmFunc =
moduleTranslation.lookupFunction(func.getName());
// The LLVM function attribute is string-valued, so spell the boolean out.
llvmFunc->addFnAttr("uniform-work-group-size",
value.getValue() ? "true" : "false");
}
// Set reqd_work_group_size metadata
if (dialect->getReqdWorkGroupSizeAttrHelper().getName() ==
attribute.getName()) {
auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
if (!func)
return failure();
return op->emitOpError(Twine(attribute.getName()) +
" is only supported on `llvm.func` operations");
auto value = dyn_cast<DenseI32ArrayAttr>(attribute.getValue());
if (!value)
return failure();
return op->emitOpError(Twine(attribute.getName()) +
" must be a dense i32 array attribute");
llvm::LLVMContext &llvmContext = moduleTranslation.getLLVMContext();
SmallVector<llvm::Metadata *, 3> metadata;
llvm::Type *i32 = llvm::IntegerType::get(llvmContext, 32);
Expand Down
9 changes: 8 additions & 1 deletion mlir/test/Target/LLVMIR/rocdl.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,12 @@ llvm.func @known_block_sizes()
llvm.return
}

llvm.func @kernel_func_no_uniform_work_groups() attributes {rocdl.kernel, rocdl.uniform_work_group_size = false} {
// CHECK-LABEL: amdgpu_kernel void @kernel_func_no_uniform_work_groups()
// CHECK: #[[$KERNEL_NO_UNIFORM_WORK_GROUPS_ATTRS:[0-9]+]]
llvm.return
}

llvm.func @rocdl.lane_id() -> i32 {
// CHECK: [[mbcntlo:%.+]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
// CHECK-NEXT: call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[mbcntlo]])
Expand Down Expand Up @@ -505,8 +511,9 @@ llvm.func @rocdl_8bit_floats(%source: i32, %stoch: i32) -> i32 {
llvm.return %source5 : i32
}

// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" }
// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "uniform-work-group-size"="true" }
// CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1,1024"
// CHECK-DAG: attributes #[[$KNOWN_BLOCK_SIZE_ATTRS]] = { "amdgpu-flat-work-group-size"="128,128"
// CHECK-DAG: attributes #[[$KERNEL_NO_UNIFORM_WORK_GROUPS_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "uniform-work-group-size"="false" }
// CHECK-DAG: ![[$RANGE]] = !{i32 0, i32 64}
// CHECK-DAG: ![[$REQD_WORK_GROUP_SIZE]] = !{i32 16, i32 4, i32 2}