Skip to content

Commit ab90ae6

Browse files
committed
[AMDGPU] Switch AnnotateUniformValues to MemorySSA
This should speed up compilation and also removes the threshold limitations used by memory dependency analysis. It also seems to fix the bug in coalescer_remat.ll where an SMRD load was used in the presence of a potentially clobbering store. Fixes: SWDEV-272132 Differential Revision: https://reviews.llvm.org/D101962
1 parent 6617a5a commit ab90ae6

File tree

5 files changed

+24
-65
lines changed

5 files changed

+24
-65
lines changed

llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp

Lines changed: 8 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,8 @@
1414

1515
#include "AMDGPU.h"
1616
#include "Utils/AMDGPUBaseInfo.h"
17-
#include "llvm/ADT/DepthFirstIterator.h"
18-
#include "llvm/ADT/SetVector.h"
1917
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
20-
#include "llvm/Analysis/LoopInfo.h"
21-
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
18+
#include "llvm/Analysis/MemorySSA.h"
2219
#include "llvm/IR/InstVisitor.h"
2320
#include "llvm/InitializePasses.h"
2421

@@ -31,8 +28,7 @@ namespace {
3128
class AMDGPUAnnotateUniformValues : public FunctionPass,
3229
public InstVisitor<AMDGPUAnnotateUniformValues> {
3330
LegacyDivergenceAnalysis *DA;
34-
MemoryDependenceResults *MDR;
35-
LoopInfo *LI;
31+
MemorySSA *MSSA;
3632
DenseMap<Value*, GetElementPtrInst*> noClobberClones;
3733
bool isEntryFunc;
3834

@@ -47,8 +43,7 @@ class AMDGPUAnnotateUniformValues : public FunctionPass,
4743
}
4844
void getAnalysisUsage(AnalysisUsage &AU) const override {
4945
AU.addRequired<LegacyDivergenceAnalysis>();
50-
AU.addRequired<MemoryDependenceWrapperPass>();
51-
AU.addRequired<LoopInfoWrapperPass>();
46+
AU.addRequired<MemorySSAWrapperPass>();
5247
AU.setPreservesAll();
5348
}
5449

@@ -62,8 +57,7 @@ class AMDGPUAnnotateUniformValues : public FunctionPass,
6257
INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
6358
"Add AMDGPU uniform metadata", false, false)
6459
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
65-
INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
66-
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
60+
INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
6761
INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
6862
"Add AMDGPU uniform metadata", false, false)
6963

@@ -77,37 +71,8 @@ static void setNoClobberMetadata(Instruction *I) {
7771
}
7872

7973
bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst * Load) {
80-
// 1. get Loop for the Load->getparent();
81-
// 2. if it exists, collect all the BBs from the most outer
82-
// loop and check for the writes. If NOT - start DFS over all preds.
83-
// 3. Start DFS over all preds from the most outer loop header.
84-
SetVector<BasicBlock *> Checklist;
85-
BasicBlock *Start = Load->getParent();
86-
Checklist.insert(Start);
87-
const Value *Ptr = Load->getPointerOperand();
88-
const Loop *L = LI->getLoopFor(Start);
89-
if (L) {
90-
const Loop *P = L;
91-
do {
92-
L = P;
93-
P = P->getParentLoop();
94-
} while (P);
95-
Checklist.insert(L->block_begin(), L->block_end());
96-
Start = L->getHeader();
97-
}
98-
99-
Checklist.insert(idf_begin(Start), idf_end(Start));
100-
for (auto &BB : Checklist) {
101-
BasicBlock::iterator StartIt = (!L && (BB == Load->getParent())) ?
102-
BasicBlock::iterator(Load) : BB->end();
103-
auto Q = MDR->getPointerDependencyFrom(
104-
MemoryLocation::getBeforeOrAfter(Ptr), true, StartIt, BB, Load);
105-
if (Q.isClobber() || Q.isUnknown() ||
106-
// Store defines the load and thus clobbers it.
107-
(Q.isDef() && Q.getInst()->mayWriteToMemory()))
108-
return true;
109-
}
110-
return false;
74+
const MemoryAccess *MA = MSSA->getWalker()->getClobberingMemoryAccess(Load);
75+
return !MSSA->isLiveOnEntryDef(MA);
11176
}
11277

11378
void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) {
@@ -172,9 +137,8 @@ bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) {
172137
if (skipFunction(F))
173138
return false;
174139

175-
DA = &getAnalysis<LegacyDivergenceAnalysis>();
176-
MDR = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
177-
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
140+
DA = &getAnalysis<LegacyDivergenceAnalysis>();
141+
MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
178142
isEntryFunc = AMDGPU::isEntryFunctionCC(F.getCallingConv());
179143

180144
visit(F);

llvm/test/CodeGen/AMDGPU/coalescer_remat.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@ declare float @llvm.fma.f32(float, float, float)
1212
; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0
1313
; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0
1414
; It's probably OK if this is slightly higher:
15-
; CHECK: ; NumVgprs: 4
16-
define amdgpu_kernel void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) {
15+
; CHECK: ; NumVgprs: 8
16+
define amdgpu_kernel void @foobar(<4 x float> addrspace(1)* noalias %out, <4 x float> addrspace(1)* noalias %in, i32 %flag) {
1717
entry:
1818
%cmpflag = icmp eq i32 %flag, 1
1919
br i1 %cmpflag, label %loop, label %exit

llvm/test/CodeGen/AMDGPU/global-smrd-unknown.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
; GCN: flat_load_dword
55
; GCN: flat_load_dword
66
; GCN: flat_store_dword
7-
define amdgpu_kernel void @unknown_memdep_analysis(float addrspace(1)* nocapture readonly %arg, float %arg1) #0 {
7+
define void @unknown_memdep_analysis(float addrspace(1)* nocapture readonly %arg, float %arg1) #0 {
88
bb:
99
%tmp53 = load float, float addrspace(1)* undef, align 4
1010
%tmp54 = getelementptr inbounds float, float addrspace(1)* %arg, i32 31

llvm/test/CodeGen/AMDGPU/llc-pipeline.ll

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -95,9 +95,8 @@
9595
; GCN-O0-NEXT: Code sinking
9696
; GCN-O0-NEXT: Post-Dominator Tree Construction
9797
; GCN-O0-NEXT: Legacy Divergence Analysis
98-
; GCN-O0-NEXT: Phi Values Analysis
9998
; GCN-O0-NEXT: Function Alias Analysis Results
100-
; GCN-O0-NEXT: Memory Dependence Analysis
99+
; GCN-O0-NEXT: Memory SSA
101100
; GCN-O0-NEXT: AMDGPU Annotate Uniform Values
102101
; GCN-O0-NEXT: SI annotate control flow
103102
; GCN-O0-NEXT: Natural Loop Information
@@ -275,9 +274,8 @@
275274
; GCN-O1-NEXT: Code sinking
276275
; GCN-O1-NEXT: Post-Dominator Tree Construction
277276
; GCN-O1-NEXT: Legacy Divergence Analysis
278-
; GCN-O1-NEXT: Phi Values Analysis
279277
; GCN-O1-NEXT: Function Alias Analysis Results
280-
; GCN-O1-NEXT: Memory Dependence Analysis
278+
; GCN-O1-NEXT: Memory SSA
281279
; GCN-O1-NEXT: AMDGPU Annotate Uniform Values
282280
; GCN-O1-NEXT: SI annotate control flow
283281
; GCN-O1-NEXT: Natural Loop Information
@@ -550,9 +548,8 @@
550548
; GCN-O1-OPTS-NEXT: Code sinking
551549
; GCN-O1-OPTS-NEXT: Post-Dominator Tree Construction
552550
; GCN-O1-OPTS-NEXT: Legacy Divergence Analysis
553-
; GCN-O1-OPTS-NEXT: Phi Values Analysis
554551
; GCN-O1-OPTS-NEXT: Function Alias Analysis Results
555-
; GCN-O1-OPTS-NEXT: Memory Dependence Analysis
552+
; GCN-O1-OPTS-NEXT: Memory SSA
556553
; GCN-O1-OPTS-NEXT: AMDGPU Annotate Uniform Values
557554
; GCN-O1-OPTS-NEXT: SI annotate control flow
558555
; GCN-O1-OPTS-NEXT: Natural Loop Information
@@ -833,9 +830,8 @@
833830
; GCN-O2-NEXT: Code sinking
834831
; GCN-O2-NEXT: Post-Dominator Tree Construction
835832
; GCN-O2-NEXT: Legacy Divergence Analysis
836-
; GCN-O2-NEXT: Phi Values Analysis
837833
; GCN-O2-NEXT: Function Alias Analysis Results
838-
; GCN-O2-NEXT: Memory Dependence Analysis
834+
; GCN-O2-NEXT: Memory SSA
839835
; GCN-O2-NEXT: AMDGPU Annotate Uniform Values
840836
; GCN-O2-NEXT: SI annotate control flow
841837
; GCN-O2-NEXT: Natural Loop Information
@@ -1129,9 +1125,8 @@
11291125
; GCN-O3-NEXT: Code sinking
11301126
; GCN-O3-NEXT: Post-Dominator Tree Construction
11311127
; GCN-O3-NEXT: Legacy Divergence Analysis
1132-
; GCN-O3-NEXT: Phi Values Analysis
11331128
; GCN-O3-NEXT: Function Alias Analysis Results
1134-
; GCN-O3-NEXT: Memory Dependence Analysis
1129+
; GCN-O3-NEXT: Memory SSA
11351130
; GCN-O3-NEXT: AMDGPU Annotate Uniform Values
11361131
; GCN-O3-NEXT: SI annotate control flow
11371132
; GCN-O3-NEXT: Natural Loop Information

llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,9 @@
1-
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
21
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 < %s | FileCheck %s
32

43
; FIXME: The wide loads and bundles introduce so much spilling.
5-
define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(float addrspace(4)* %wei_ptr, float addrspace(1)* %out_ptr) {
4+
define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(float addrspace(4)* %wei_ptr, float addrspace(1)* %out_ptr, float addrspace(1)* %in) {
65
; CHECK-LABEL: excess_soft_clause_reg_pressure:
76
; CHECK: BB0_1: ; %for.cond28.preheader
8-
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
97
; CHECK: global_load_dword
108
; CHECK-NEXT: global_load_dword
119
; CHECK-NEXT: global_load_dword
@@ -14,6 +12,7 @@ define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(float addrs
1412
; CHECK: s_load_dwordx16
1513
; CHECK-NEXT: s_load_dwordx16
1614
; CHECK-NEXT: s_load_dwordx16
15+
; CHECK-NEXT: s_load_dwordx16
1716

1817
; CHECK: v_writelane_b32
1918
; CHECK-NEXT: v_writelane_b32
@@ -32,7 +31,6 @@ define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(float addrs
3231
; CHECK-NEXT: v_writelane_b32
3332
; CHECK-NEXT: v_writelane_b32
3433
; CHECK-NEXT: s_load_dwordx16
35-
; CHECK-NEXT: s_load_dwordx16
3634

3735
; CHECK: v_writelane_b32
3836
; CHECK-NEXT: v_writelane_b32
@@ -50,7 +48,6 @@ define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(float addrs
5048
; CHECK-NEXT: v_writelane_b32
5149
; CHECK-NEXT: v_writelane_b32
5250
; CHECK-NEXT: v_writelane_b32
53-
; CHECK-NEXT: s_load_dwordx16
5451

5552
; CHECK: v_readlane_b32
5653
; CHECK-NEXT: v_readlane_b32
@@ -70,8 +67,10 @@ define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(float addrs
7067
; CHECK-NEXT: v_readlane_b32
7168

7269
; CHECK: s_load_dwordx16
73-
; CHECK-NEXT: s_load_dwordx16
74-
; CHECK-NEXT: v_readlane_b32
70+
; CHECK: s_load_dwordx16
71+
; CHECK: s_load_dwordx16
72+
73+
; CHECK: v_readlane_b32
7574
; CHECK-NEXT: v_readlane_b32
7675
; CHECK-NEXT: v_readlane_b32
7776
; CHECK-NEXT: v_readlane_b32
@@ -100,6 +99,7 @@ entry:
10099
%conv.frozen = freeze i32 %conv
101100
%div = udiv i32 %conv.frozen, 49
102101
%add.ptr22 = getelementptr inbounds float, float addrspace(4)* %wei_ptr, i64 undef
102+
%in.ptr1 = getelementptr inbounds float, float addrspace(1)* %in, i32 %i5
103103
br label %for.cond28.preheader
104104

105105
for.cond28.preheader: ; preds = %for.cond28.preheader, %entry
@@ -135,7 +135,7 @@ for.cond28.preheader: ; preds = %for.cond28.preheade
135135
%accum.sroa.118.0 = phi float [ 0.000000e+00, %entry ], [ %i259, %for.cond28.preheader ]
136136
%accum.sroa.122.0 = phi float [ 0.000000e+00, %entry ], [ %i263, %for.cond28.preheader ]
137137
%accum.sroa.126.0 = phi float [ 0.000000e+00, %entry ], [ %i267, %for.cond28.preheader ]
138-
%i_ptr.0288 = phi float addrspace(1)* [ undef, %entry ], [ %add.ptr47.3, %for.cond28.preheader ]
138+
%i_ptr.0288 = phi float addrspace(1)* [ %in.ptr1, %entry ], [ %add.ptr47.3, %for.cond28.preheader ]
139139
%w_ptr.0287 = phi float addrspace(4)* [ %add.ptr22, %entry ], [ %add.ptr74, %for.cond28.preheader ]
140140
%ci.0286 = phi i32 [ 0, %entry ], [ %inc116, %for.cond28.preheader ]
141141
%i8 = load float, float addrspace(1)* %i_ptr.0288, align 4

0 commit comments

Comments
 (0)