[AMDGPU][Attributor] Infer inreg attribute in AMDGPUAttributor #101609
Conversation
✅ With the latest revision this PR passed the C/C++ code formatter.
@llvm/pr-subscribers-llvm-ir @llvm/pr-subscribers-backend-amdgpu

Author: Shilei Tian (shiltian)

Changes

This patch introduces AAAMDGPUInreg, which can infer the inreg function argument attribute. The idea is: for a function argument, if the corresponding call site arguments are uniform, we can mark it as inreg and thus pass it via SGPR.

In addition, this AA is also able to propagate the inreg attribute if feasible.

Full diff: https://github.com/llvm/llvm-project/pull/101609.diff

4 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 72049f0aa6b86e..32b27aa71cc9ce 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -14,6 +14,7 @@
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/CycleAnalysis.h"
+#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
@@ -1014,6 +1015,97 @@ struct AAAMDGPUNoAGPR
const char AAAMDGPUNoAGPR::ID = 0;
+struct AAAMDGPUInreg
+ : public IRAttribute<Attribute::InReg,
+ StateWrapper<BooleanState, AbstractAttribute>,
+ AAAMDGPUInreg> {
+ AAAMDGPUInreg(const IRPosition &IRP, Attributor &A) : IRAttribute(IRP) {}
+
+ /// Create an abstract attribute view for the position \p IRP.
+ static AAAMDGPUInreg &createForPosition(const IRPosition &IRP, Attributor &A);
+
+ /// See AbstractAttribute::getName()
+ const std::string getName() const override { return "AAAMDGPUInreg"; }
+
+ const std::string getAsStr(Attributor *A) const override {
+ return getAssumed() ? "inreg" : "non-inreg";
+ }
+
+ void trackStatistics() const override {}
+
+ /// See AbstractAttribute::getIdAddr()
+ const char *getIdAddr() const override { return &ID; }
+
+ /// This function should return true if the type of the \p AA is AAAMDGPUInreg
+ static bool classof(const AbstractAttribute *AA) {
+ return (AA->getIdAddr() == &ID);
+ }
+
+ /// Unique ID (due to the unique address)
+ static const char ID;
+};
+
+const char AAAMDGPUInreg::ID = 0;
+
+namespace {
+
+struct AAAMDGPUInregArgument : public AAAMDGPUInreg {
+ AAAMDGPUInregArgument(const IRPosition &IRP, Attributor &A)
+ : AAAMDGPUInreg(IRP, A) {}
+
+ void initialize(Attributor &A) override {
+ if (getAssociatedArgument()->hasAttribute(Attribute::InReg))
+ indicateOptimisticFixpoint();
+ }
+
+ ChangeStatus updateImpl(Attributor &A) override {
+ unsigned ArgNo = getAssociatedArgument()->getArgNo();
+
+ auto Pred = [&](AbstractCallSite ACS) -> bool {
+ CallBase *CB = ACS.getInstruction();
+ Value *V = CB->getArgOperandUse(ArgNo);
+ if (auto *G = dyn_cast<GlobalValue>(V))
+ return true;
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ auto AU = A.getInfoCache()
+ .getAnalysisResultForFunction<UniformityInfoAnalysis>(
+ *I->getFunction());
+ return AU && AU->isUniform(I);
+ }
+ if (auto *Arg = dyn_cast<Argument>(V)) {
+ auto *AA =
+ A.getOrCreateAAFor<AAAMDGPUInreg>(IRPosition::argument(*Arg));
+ return AA && AA->isValidState();
+ }
+ // For unforeseen cases, we need to assume it is not uniform thus not
+ // qualified for inreg.
+ return false;
+ };
+
+ bool UsedAssumedInformation = false;
+ if (!A.checkForAllCallSites(Pred, *this, /*RequireAllCallSites=*/true,
+ UsedAssumedInformation))
+ return indicatePessimisticFixpoint();
+
+ if (!UsedAssumedInformation)
+ return indicateOptimisticFixpoint();
+
+ return ChangeStatus::UNCHANGED;
+ }
+};
+
+} // namespace
+
+AAAMDGPUInreg &AAAMDGPUInreg::createForPosition(const IRPosition &IRP,
+ Attributor &A) {
+ switch (IRP.getPositionKind()) {
+ case IRPosition::IRP_ARGUMENT:
+ return *new (A.Allocator) AAAMDGPUInregArgument(IRP, A);
+ default:
+ llvm_unreachable("not a valid position for AAAMDGPUInreg");
+ }
+}
+
static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
for (unsigned I = 0;
@@ -1046,7 +1138,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
&AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID, &AACallEdges::ID,
&AAPointerInfo::ID, &AAPotentialConstantValues::ID,
&AAUnderlyingObjects::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
- &AAInstanceInfo::ID});
+ &AAInstanceInfo::ID, &AAAMDGPUInreg::ID});
AttributorConfig AC(CGUpdater);
AC.IsClosedWorldModule = Options.IsClosedWorld;
@@ -1090,6 +1182,11 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
IRPosition::value(*SI->getPointerOperand()));
}
}
+
+ if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL) {
+ for (auto &Arg : F.args())
+ A.getOrCreateAAFor<AAAMDGPUInreg>(IRPosition::argument(Arg));
+ }
}
ChangeStatus Change = A.run();
@@ -1118,6 +1215,7 @@ class AMDGPUAttributorLegacy : public ModulePass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<CycleInfoWrapperPass>();
+ AU.addRequired<UniformityInfoWrapperPass>();
}
StringRef getPassName() const override { return "AMDGPU Attributor"; }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-accesslist-offsetbins-out-of-sync.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-accesslist-offsetbins-out-of-sync.ll
index d58a62408427dc..4f46e08921a49b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-accesslist-offsetbins-out-of-sync.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-accesslist-offsetbins-out-of-sync.ll
@@ -8,7 +8,7 @@
define internal fastcc void @foo(ptr %kg) {
; CHECK-LABEL: define internal fastcc void @foo(
-; CHECK-SAME: ptr [[KG:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ptr inreg [[KG:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[CLOSURE_I25_I:%.*]] = getelementptr i8, ptr [[KG]], i64 336
; CHECK-NEXT: [[NUM_CLOSURE_I26_I:%.*]] = getelementptr i8, ptr [[KG]], i64 276
diff --git a/llvm/test/CodeGen/AMDGPU/inreg-inference.ll b/llvm/test/CodeGen/AMDGPU/inreg-inference.ll
new file mode 100644
index 00000000000000..94e5e700a78b14
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/inreg-inference.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -passes=amdgpu-attributor %s -o - | FileCheck %s
+
+@g1 = protected addrspace(1) externally_initialized global i32 0, align 4
+@g2 = protected addrspace(1) externally_initialized global i32 0, align 4
+
+;.
+; CHECK: @g1 = protected addrspace(1) externally_initialized global i32 0, align 4
+; CHECK: @g2 = protected addrspace(1) externally_initialized global i32 0, align 4
+;.
+define internal fastcc void @f(ptr %x, ptr %y) {
+; CHECK-LABEL: define {{[^@]+}}@f
+; CHECK-SAME: (ptr inreg [[X:%.*]], ptr inreg [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[X_VAL:%.*]] = load i32, ptr [[X]], align 4
+; CHECK-NEXT: store i32 [[X_VAL]], ptr addrspace(1) @g1, align 4
+; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[Y]], align 4
+; CHECK-NEXT: store i32 [[LOAD]], ptr addrspace(1) @g2, align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %x.val = load i32, ptr %x, align 4
+ store i32 %x.val, ptr addrspace(1) @g1, align 4
+ %load = load i32, ptr %y, align 4
+ store i32 %load, ptr addrspace(1) @g2, align 4
+ ret void
+}
+
+define protected amdgpu_kernel void @kernel(ptr addrspace(1) %x2, i32 %z) {
+; CHECK-LABEL: define {{[^@]+}}@kernel
+; CHECK-SAME: (ptr addrspace(1) [[X2:%.*]], i32 [[Z:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[X2_CAST:%.*]] = addrspacecast ptr addrspace(1) [[X2]] to ptr
+; CHECK-NEXT: [[QUEUE_PTR:%.*]] = tail call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
+; CHECK-NEXT: [[QUEUE_PTR_CAST:%.*]] = addrspacecast ptr addrspace(4) [[QUEUE_PTR]] to ptr
+; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[IMPLICITARG_PTR_CAST:%.*]] = addrspacecast ptr addrspace(4) [[IMPLICITARG_PTR]] to ptr
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[Z]], 0
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], ptr [[QUEUE_PTR_CAST]], ptr [[X2_CAST]]
+; CHECK-NEXT: tail call fastcc void @f(ptr [[COND]], ptr noundef [[IMPLICITARG_PTR_CAST]])
+; CHECK-NEXT: [[DOTVAL:%.*]] = load i32, ptr addrspace(4) [[QUEUE_PTR]], align 4
+; CHECK-NEXT: tail call fastcc void @f(ptr [[COND]], ptr noundef [[IMPLICITARG_PTR_CAST]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %x2.cast = addrspacecast ptr addrspace(1) %x2 to ptr
+ %queue.ptr = tail call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
+ %queue.ptr.cast = addrspacecast ptr addrspace(4) %queue.ptr to ptr
+ %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %implicitarg.ptr.cast = addrspacecast ptr addrspace(4) %implicitarg.ptr to ptr
+ %cmp = icmp sgt i32 %z, 0
+ %cond = select i1 %cmp, ptr %queue.ptr.cast, ptr %x2.cast
+ tail call fastcc void @f(ptr %cond, ptr noundef %implicitarg.ptr.cast)
+ %.val = load i32, ptr addrspace(4) %queue.ptr, align 4
+ tail call fastcc void @f(ptr %cond, ptr noundef %implicitarg.ptr.cast)
+ ret void
+}
+
+declare align 4 ptr addrspace(4) @llvm.amdgcn.queue.ptr()
+
+declare align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+;.
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
index 384a9c4043a1d3..65a6322dd4730a 100644
--- a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
+++ b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
@@ -8,11 +8,11 @@
@recursive.kernel.lds = addrspace(3) global i16 poison
;.
-; CHECK: @[[LLVM_AMDGCN_KERNEL_K0_F0_LDS:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [[LLVM_AMDGCN_KERNEL_K0_F0_LDS_T:%.*]] poison, align 2, !absolute_symbol !0
-; CHECK: @[[LLVM_AMDGCN_KERNEL_K1_F0_LDS:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [[LLVM_AMDGCN_KERNEL_K1_F0_LDS_T:%.*]] poison, align 2, !absolute_symbol !0
-; CHECK: @[[LLVM_AMDGCN_KERNEL_KERNEL_LDS_LDS:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [[LLVM_AMDGCN_KERNEL_KERNEL_LDS_LDS_T:%.*]] poison, align 2, !absolute_symbol !0
-; CHECK: @[[LLVM_AMDGCN_KERNEL_KERNEL_LDS_RECURSION_LDS:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [[LLVM_AMDGCN_KERNEL_KERNEL_LDS_RECURSION_LDS_T:%.*]] poison, align 2, !absolute_symbol !0
-; CHECK: @[[LLVM_AMDGCN_LDS_OFFSET_TABLE:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(4) constant [3 x [2 x i32]]
+; CHECK: @llvm.amdgcn.kernel.k0_f0.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k0_f0.lds.t poison, align 2, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @llvm.amdgcn.kernel.k1_f0.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1_f0.lds.t poison, align 2, !absolute_symbol [[META0]]
+; CHECK: @llvm.amdgcn.kernel.kernel_lds.lds = internal addrspace(3) global %llvm.amdgcn.kernel.kernel_lds.lds.t poison, align 2, !absolute_symbol [[META0]]
+; CHECK: @llvm.amdgcn.kernel.kernel_lds_recursion.lds = internal addrspace(3) global %llvm.amdgcn.kernel.kernel_lds_recursion.lds.t poison, align 2, !absolute_symbol [[META0]]
+; CHECK: @llvm.amdgcn.lds.offset.table = internal addrspace(4) constant [3 x [2 x i32]] [[2 x i32] [i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.kernel.k0_f0.lds to i32), i32 poison], [2 x i32] [i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.kernel.k1_f0.lds to i32), i32 ptrtoint (ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k1_f0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k1_f0.lds, i32 0, i32 1) to i32)], [2 x i32] [i32 poison, i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.kernel.kernel_lds_recursion.lds to i32)]]
;.
define internal void @lds_use_through_indirect() {
; CHECK-LABEL: define internal void @lds_use_through_indirect(
@@ -105,7 +105,7 @@ define internal void @f0_transitive() {
define amdgpu_kernel void @k0_f0() {
; CHECK-LABEL: define amdgpu_kernel void @k0_f0(
-; CHECK-SAME: ) #[[ATTR2:[0-9]+]] !llvm.amdgcn.lds.kernel.id !2 {
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META2:![0-9]+]] {
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k0_f0.lds) ]
; CHECK-NEXT: call void @f0_transitive()
; CHECK-NEXT: ret void
@@ -116,8 +116,8 @@ define amdgpu_kernel void @k0_f0() {
define amdgpu_kernel void @k1_f0() {
; CHECK-LABEL: define amdgpu_kernel void @k1_f0(
-; CHECK-SAME: ) #[[ATTR3:[0-9]+]] !llvm.amdgcn.lds.kernel.id !3 {
-; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k1_f0.lds) ], !alias.scope !4, !noalias !7
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META3:![0-9]+]] {
+; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k1_f0.lds) ], !alias.scope [[META4:![0-9]+]], !noalias [[META7:![0-9]+]]
; CHECK-NEXT: call void @f0_transitive()
; CHECK-NEXT: [[FPTR:%.*]] = load volatile ptr, ptr addrspace(1) null, align 8
; CHECK-NEXT: call void [[FPTR]]()
@@ -168,7 +168,7 @@ define internal i16 @mutual_recursion_0(i16 %arg) {
define internal void @mutual_recursion_1(i16 %arg) {
; CHECK-LABEL: define internal void @mutual_recursion_1(
-; CHECK-SAME: i16 [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i16 inreg [[ARG:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @mutual_recursion_0(i16 [[ARG]])
; CHECK-NEXT: ret void
;
@@ -178,7 +178,7 @@ define internal void @mutual_recursion_1(i16 %arg) {
define amdgpu_kernel void @kernel_lds_recursion() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_lds_recursion(
-; CHECK-SAME: ) #[[ATTR2]] !llvm.amdgcn.lds.kernel.id !9 {
+; CHECK-SAME: ) #[[ATTR2]] !llvm.amdgcn.lds.kernel.id [[META9:![0-9]+]] {
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kernel_lds_recursion.lds) ]
; CHECK-NEXT: call void @mutual_recursion_0(i16 0)
; CHECK-NEXT: ret void
@@ -199,15 +199,16 @@ define amdgpu_kernel void @kernel_lds_recursion() {
; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
;.
-; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 1}
-; CHECK: [[META1:![0-9]+]] = !{i32 0}
-; CHECK: [[META2:![0-9]+]] = !{i32 1}
-; CHECK: [[META3:![0-9]+]] = !{!5}
-; CHECK: [[META4:![0-9]+]] = distinct !{!5, !6}
-; CHECK: [[META5:![0-9]+]] = distinct !{!6}
-; CHECK: [[META6:![0-9]+]] = !{!8}
-; CHECK: [[META7:![0-9]+]] = distinct !{!8, !6}
-; CHECK: [[META8:![0-9]+]] = !{i32 2}
+; CHECK: [[META0]] = !{i32 0, i32 1}
+; CHECK: [[META1:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 400}
+; CHECK: [[META2]] = !{i32 0}
+; CHECK: [[META3]] = !{i32 1}
+; CHECK: [[META4]] = !{[[META5:![0-9]+]]}
+; CHECK: [[META5]] = distinct !{[[META5]], [[META6:![0-9]+]]}
+; CHECK: [[META6]] = distinct !{[[META6]]}
+; CHECK: [[META7]] = !{[[META8:![0-9]+]]}
+; CHECK: [[META8]] = distinct !{[[META8]], [[META6]]}
+; CHECK: [[META9]] = !{i32 2}
;.
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; TABLE: {{.*}}
Looks okay to me, with a few small fixes. But I am not familiar with all the implications of inreg. Please wait for approval from @arsenm.
inreg argument generation is busted, and this cannot land until that is fixed. e.g. https://godbolt.org/z/joeTjKbf7 is clobbering the SRD.

This PR should be on hold until #113782 is resolved.
Value *NewV = Builder.CreateIntrinsic(V->getType(),
                                      Intrinsic::amdgcn_readfirstlane, {V});
CB->setArgOperand(ArgNo, NewV);
Can you directly do this in the manifest of the attribute? Ideally this would only be done for cases where this pass introduced the inreg argument, leaving existing cases alone.
Ideally yes, but it is not feasible at the moment. Long story short, it could end up registering multiple updates/replacements of a single IRP in the manifest map, which is currently not supported.
// We don't need readfirstlane for a global value.
if (isa<GlobalValue>(CSArg))
  continue;
Any constant, or really any isTriviallyUniform
Any constant
Not necessarily. An address space cast can diverge in the future (the code has not been upstreamed yet, I think).
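For illustration, a trivially-uniform check along the lines of this discussion might look like the sketch below. This is an assumption-laden sketch, not the PR's code: the helper name is made up, and the address-space-cast carve-out simply mirrors the caveat above; AMDGPU::isIntrinsicAlwaysUniform is the existing query from AMDGPUBaseInfo.

// Sketch only (hypothetical helper): accept values that are uniform by
// construction, per the review discussion above.
static bool isTriviallyUniformValue(const Value *V) {
  // Global values are uniform by construction.
  if (isa<GlobalValue>(V))
    return true;
  // Most constants are uniform, but conservatively reject constant
  // expressions built from an address space cast, which may become
  // divergent in the future.
  if (const auto *CE = dyn_cast<ConstantExpr>(V))
    return CE->getOpcode() != Instruction::AddrSpaceCast;
  if (isa<Constant>(V))
    return true;
  // Calls to intrinsics that are always uniform (e.g. readfirstlane).
  if (const auto *II = dyn_cast<IntrinsicInst>(V))
    return AMDGPU::isIntrinsicAlwaysUniform(II->getIntrinsicID());
  return false;
}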
// We don't directly emit readfirstlane here because it will cause multiple
// replacements of a single use in the manifest map, which is not supported
// at this moment.
I don't really understand the problem
if (WorkList.empty())
  return Changed;

for (auto &[CB, ArgNo] : WorkList) {
-  for (auto &[CB, ArgNo] : WorkList) {
+  for (auto [CB, ArgNo] : WorkList) {
Value *NewV = Builder.CreateIntrinsic(V->getType(),
                                      Intrinsic::amdgcn_readfirstlane, {V});
If the function uses convergence tokens it should add the convergencectrl bundle
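For concreteness, attaching the bundle when building the readfirstlane call could look roughly like the sketch below. getConvergenceToken is a hypothetical lookup of the function's convergence anchor, while CallInst::Create genuinely accepts operand bundles; this is not the PR's code.

// Sketch only: attach a convergencectrl bundle when the enclosing function
// uses convergence tokens. getConvergenceToken is a hypothetical helper.
SmallVector<OperandBundleDef, 1> Bundles;
if (Value *Token = getConvergenceToken(*CB->getFunction()))
  Bundles.emplace_back("convergencectrl", Token);
Function *Decl = Intrinsic::getDeclaration(
    CB->getModule(), Intrinsic::amdgcn_readfirstlane, {V->getType()});
Value *NewV = CallInst::Create(Decl, {V}, Bundles, "", CB);
CB->setArgOperand(ArgNo, NewV);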
// Add both inreg and "uniform" attribute to the argument. We will emit a
// readfirstlane at each call site for inreg uniform argument, and the
// "uniform" attribute will be removed later.
LLVMContext &Ctx = Arg->getContext();
return A.manifestAttrs(getIRPosition(),
                       {Attribute::get(Ctx, Attribute::InReg),
                        Attribute::get(Ctx, "uniform")});
"uniform"
is a hack on a hack, and is potentially unsafe on later code motion of the callsite. This comment doesn't explain why you would want to insert the readfirstlane in the first place, which is another giant hack.
Once again, I think it would be easiest to restrict this to the trivially uniform case for now. Extending this to arbitrary uniform analysis and the readfirstlanes should be a second step. The trivially uniform case is the most important case, and every problem is going to be in these other cases
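Restricting to the trivially uniform case would collapse the call-site predicate in updateImpl to something like the sketch below, reusing the hypothetical isTriviallyUniformValue check from earlier, with no readfirstlane emission and no "uniform" marker attribute. Again, this is an illustrative sketch of the reviewer's suggestion, not code from the PR.

// Sketch only: a call-site argument qualifies only if it is trivially
// uniform; anything else pessimistically blocks the inreg inference.
auto Pred = [&](AbstractCallSite ACS) -> bool {
  Value *V = ACS.getInstruction()->getArgOperand(ArgNo);
  return isTriviallyUniformValue(V);
};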
This patch introduces AAAMDGPUInreg, which can infer the inreg function argument attribute. The idea is: for a function argument, if the corresponding call site arguments are uniform, we can mark it as inreg and thus pass it via SGPR.

In addition, this AA is also able to propagate the inreg attribute if feasible.