-
Notifications
You must be signed in to change notification settings - Fork 14.3k
AMDGPU: Mark grid size loads with range metadata #113019
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) Changes: Only handles the v5 case. Full diff: https://github.com/llvm/llvm-project/pull/113019.diff 3 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
index 1bb5e794da7dd6..5fc0c36359b6f5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -23,6 +23,7 @@
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"
@@ -83,6 +84,20 @@ Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
} // end anonymous namespace
+static void annotateGridSizeLoadWithRangeMD(LoadInst *Load,
+ uint32_t MaxNumGroups) {
+ if (MaxNumGroups == 0 || MaxNumGroups == std::numeric_limits<uint32_t>::max())
+ return;
+
+ if (!Load->getType()->isIntegerTy(32))
+ return;
+
+ // TODO: If there is existing range metadata, preserve it if it is stricter.
+ MDBuilder MDB(Load->getContext());
+ MDNode *Range = MDB.createRange(APInt(32, 1), APInt(32, MaxNumGroups + 1));
+ Load->setMetadata(LLVMContext::MD_range, Range);
+}
+
static bool processUse(CallInst *CI, bool IsV5OrAbove) {
Function *F = CI->getParent()->getParent();
@@ -92,7 +107,11 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
const bool HasUniformWorkGroupSize =
F->getFnAttribute("uniform-work-group-size").getValueAsBool();
- if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize)
+ SmallVector<unsigned> MaxNumWorkgroups =
+ AMDGPU::getIntegerVecAttribute(*F, "amdgpu-max-num-workgroups", 3);
+
+ if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize &&
+ none_of(MaxNumWorkgroups, [](unsigned X) { return X != 0; }))
return false;
Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
@@ -133,16 +152,22 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
if (IsV5OrAbove) { // Base is ImplicitArgPtr.
switch (Offset) {
case HIDDEN_BLOCK_COUNT_X:
- if (LoadSize == 4)
+ if (LoadSize == 4) {
BlockCounts[0] = Load;
+ annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[0]);
+ }
break;
case HIDDEN_BLOCK_COUNT_Y:
- if (LoadSize == 4)
+ if (LoadSize == 4) {
BlockCounts[1] = Load;
+ annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[1]);
+ }
break;
case HIDDEN_BLOCK_COUNT_Z:
- if (LoadSize == 4)
+ if (LoadSize == 4) {
BlockCounts[2] = Load;
+ annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[2]);
+ }
break;
case HIDDEN_GROUP_SIZE_X:
if (LoadSize == 2)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 961a9220b48d6b..5a899b755419ae 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -369,6 +369,7 @@ const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Funct
TM.getSubtarget<R600Subtarget>(F));
}
+// FIXME: This has no reason to be in subtarget
SmallVector<unsigned>
AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3);
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll
new file mode 100644
index 00000000000000..9064292129928f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll
@@ -0,0 +1,124 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-lower-kernel-attributes %s | FileCheck %s
+
+define i32 @use_grid_size_x_max_num_workgroups() #0 {
+; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG0:![0-9]+]]
+; CHECK-NEXT: ret i32 [[GRID_SIZE_X]]
+;
+ %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4
+ ret i32 %grid.size.x
+}
+
+define i32 @use_grid_size_x_max_num_workgroups_existing_nonzero_range() #0 {
+; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_existing_nonzero_range(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG0]]
+; CHECK-NEXT: ret i32 [[GRID_SIZE_X]]
+;
+ %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4, !range !0
+ ret i32 %grid.size.x
+}
+
+define i32 @use_grid_size_y_max_num_workgroups() #0 {
+; CHECK-LABEL: define i32 @use_grid_size_y_max_num_workgroups(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[GEP_GRID_SIZE_Y:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 4
+; CHECK-NEXT: [[GRID_SIZE_Y:%.*]] = load i32, ptr addrspace(4) [[GEP_GRID_SIZE_Y]], align 4, !range [[RNG1:![0-9]+]]
+; CHECK-NEXT: ret i32 [[GRID_SIZE_Y]]
+;
+ %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep.grid.size.y = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 4
+ %grid.size.y = load i32, ptr addrspace(4) %gep.grid.size.y, align 4
+ ret i32 %grid.size.y
+}
+
+define i32 @use_grid_size_z_max_num_workgroups() #0 {
+; CHECK-LABEL: define i32 @use_grid_size_z_max_num_workgroups(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[GEP_GRID_SIZE_Z:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 8
+; CHECK-NEXT: [[GRID_SIZE_Z:%.*]] = load i32, ptr addrspace(4) [[GEP_GRID_SIZE_Z]], align 4, !range [[RNG2:![0-9]+]]
+; CHECK-NEXT: ret i32 [[GRID_SIZE_Z]]
+;
+ %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep.grid.size.z = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 8
+ %grid.size.z = load i32, ptr addrspace(4) %gep.grid.size.z, align 4
+ ret i32 %grid.size.z
+}
+
+define <2 x i16> @use_grid_size_x_max_num_workgroups_load_wrong_type() #0 {
+; CHECK-LABEL: define <2 x i16> @use_grid_size_x_max_num_workgroups_load_wrong_type(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load <2 x i16>, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4
+; CHECK-NEXT: ret <2 x i16> [[GRID_SIZE_X]]
+;
+ %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %grid.size.x = load <2 x i16>, ptr addrspace(4) %implicitarg.ptr, align 4
+ ret <2 x i16> %grid.size.x
+}
+
+define i32 @use_grid_size_x_max_num_workgroups_max_minus_1() #1 {
+; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_max_minus_1(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG3:![0-9]+]]
+; CHECK-NEXT: ret i32 [[GRID_SIZE_X]]
+;
+ %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4
+ ret i32 %grid.size.x
+}
+
+define i32 @use_grid_size_x_max_num_workgroups_max() #2 {
+; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_max(
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4
+; CHECK-NEXT: ret i32 [[GRID_SIZE_X]]
+;
+ %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4
+ ret i32 %grid.size.x
+}
+
+define i32 @use_grid_size_x_max_num_workgroups_zero() #3 {
+; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_zero(
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] {
+; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4
+; CHECK-NEXT: ret i32 [[GRID_SIZE_X]]
+;
+ %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4
+ ret i32 %grid.size.x
+}
+
+declare noundef align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #3
+
+attributes #0 = { "amdgpu-max-num-workgroups"="36,42,89" }
+attributes #1 = { "amdgpu-max-num-workgroups"="4294967294,42,89" }
+attributes #2 = { "amdgpu-max-num-workgroups"="4294967295,42,89" }
+attributes #3 = { "amdgpu-max-num-workgroups"="0,42,89" }
+
+!0 = !{i32 0, i32 -1}
+
+;.
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-max-num-workgroups"="36,42,89" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-max-num-workgroups"="4294967294,42,89" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-max-num-workgroups"="4294967295,42,89" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-max-num-workgroups"="0,42,89" }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+;.
+; CHECK: [[RNG0]] = !{i32 1, i32 37}
+; CHECK: [[RNG1]] = !{i32 1, i32 43}
+; CHECK: [[RNG2]] = !{i32 1, i32 90}
+; CHECK: [[RNG3]] = !{i32 1, i32 -1}
+;.
|
2005b26
to
58a330c
Compare
297173f
to
5336b21
Compare
58a330c
to
b0cdf6f
Compare
5336b21
to
0bd73d5
Compare
b0cdf6f
to
d26f42b
Compare
0bd73d5
to
8c4d57e
Compare
d26f42b
to
8272a93
Compare
8c4d57e
to
64d530b
Compare
8272a93
to
b5edcd1
Compare
64d530b
to
63a1993
Compare
I'm not sure what the interpretation of 0 is supposed to be; AMDGPUUsage doesn't say.
Only handles the v5 case.
b5edcd1
to
2ad2f35
Compare
63a1993
to
cc4a772
Compare
How are we gonna use the three sizes after we annotate them? What is the impact of using an inaccurate number? |
It enables known-bits-style optimizations. I think we're introducing these a bit late, though; eventually we should probably move this directly into the attributor. The big win would be enabling shrinking 64-bit work item calculations to 32-bit. |
Only handles the v5 case.