Skip to content

Commit 061730b

Browse files
Pierre-vh authored and chencha3 committed
[AMDGPU][PromoteAlloca] Whole-function alloca promotion to vector (llvm#84735)
Update PromoteAllocaToVector so it considers the whole function before promoting allocas. Allocas are scored & sorted so the highest value ones are seen first. The budget is now per function instead of per alloca. Passed internal performance testing.
1 parent 3c0002c commit 061730b

File tree

3 files changed

+182
-31
lines changed

3 files changed

+182
-31
lines changed

llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

Lines changed: 108 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,15 @@
3232
#include "llvm/Analysis/CaptureTracking.h"
3333
#include "llvm/Analysis/InstSimplifyFolder.h"
3434
#include "llvm/Analysis/InstructionSimplify.h"
35+
#include "llvm/Analysis/LoopInfo.h"
3536
#include "llvm/Analysis/ValueTracking.h"
3637
#include "llvm/CodeGen/TargetPassConfig.h"
3738
#include "llvm/IR/IRBuilder.h"
3839
#include "llvm/IR/IntrinsicInst.h"
3940
#include "llvm/IR/IntrinsicsAMDGPU.h"
4041
#include "llvm/IR/IntrinsicsR600.h"
4142
#include "llvm/IR/PatternMatch.h"
43+
#include "llvm/InitializePasses.h"
4244
#include "llvm/Pass.h"
4345
#include "llvm/Target/TargetMachine.h"
4446
#include "llvm/Transforms/Utils/SSAUpdater.h"
@@ -64,10 +66,17 @@ static cl::opt<unsigned> PromoteAllocaToVectorLimit(
6466
cl::desc("Maximum byte size to consider promote alloca to vector"),
6567
cl::init(0));
6668

69+
static cl::opt<unsigned>
70+
LoopUserWeight("promote-alloca-vector-loop-user-weight",
71+
cl::desc("The bonus weight of users of allocas within loop "
72+
"when sorting profitable allocas"),
73+
cl::init(4));
74+
6775
// Shared implementation which can do both promotion to vector and to LDS.
6876
class AMDGPUPromoteAllocaImpl {
6977
private:
7078
const TargetMachine &TM;
79+
LoopInfo &LI;
7180
Module *Mod = nullptr;
7281
const DataLayout *DL = nullptr;
7382

@@ -101,8 +110,11 @@ class AMDGPUPromoteAllocaImpl {
101110
bool tryPromoteAllocaToVector(AllocaInst &I);
102111
bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS);
103112

113+
void sortAllocasToPromote(SmallVectorImpl<AllocaInst *> &Allocas);
114+
104115
public:
105-
AMDGPUPromoteAllocaImpl(TargetMachine &TM) : TM(TM) {
116+
AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {
117+
106118
const Triple &TT = TM.getTargetTriple();
107119
IsAMDGCN = TT.getArch() == Triple::amdgcn;
108120
IsAMDHSA = TT.getOS() == Triple::AMDHSA;
@@ -122,7 +134,9 @@ class AMDGPUPromoteAlloca : public FunctionPass {
122134
if (skipFunction(F))
123135
return false;
124136
if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
125-
return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>())
137+
return AMDGPUPromoteAllocaImpl(
138+
TPC->getTM<TargetMachine>(),
139+
getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
126140
.run(F, /*PromoteToLDS*/ true);
127141
return false;
128142
}
@@ -131,6 +145,7 @@ class AMDGPUPromoteAlloca : public FunctionPass {
131145

132146
void getAnalysisUsage(AnalysisUsage &AU) const override {
133147
AU.setPreservesCFG();
148+
AU.addRequired<LoopInfoWrapperPass>();
134149
FunctionPass::getAnalysisUsage(AU);
135150
}
136151
};
@@ -145,7 +160,9 @@ class AMDGPUPromoteAllocaToVector : public FunctionPass {
145160
if (skipFunction(F))
146161
return false;
147162
if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
148-
return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>())
163+
return AMDGPUPromoteAllocaImpl(
164+
TPC->getTM<TargetMachine>(),
165+
getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
149166
.run(F, /*PromoteToLDS*/ false);
150167
return false;
151168
}
@@ -156,6 +173,7 @@ class AMDGPUPromoteAllocaToVector : public FunctionPass {
156173

157174
void getAnalysisUsage(AnalysisUsage &AU) const override {
158175
AU.setPreservesCFG();
176+
AU.addRequired<LoopInfoWrapperPass>();
159177
FunctionPass::getAnalysisUsage(AU);
160178
}
161179
};
@@ -186,18 +204,23 @@ INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
186204
// Move LDS uses from functions to kernels before promote alloca for accurate
187205
// estimation of LDS available
188206
INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy)
207+
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
189208
INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
190209
"AMDGPU promote alloca to vector or LDS", false, false)
191210

192-
INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
193-
"AMDGPU promote alloca to vector", false, false)
211+
INITIALIZE_PASS_BEGIN(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
212+
"AMDGPU promote alloca to vector", false, false)
213+
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
214+
INITIALIZE_PASS_END(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
215+
"AMDGPU promote alloca to vector", false, false)
194216

195217
char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
196218
char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID;
197219

198220
PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
199221
FunctionAnalysisManager &AM) {
200-
bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F, /*PromoteToLDS*/ true);
222+
auto &LI = AM.getResult<LoopAnalysis>(F);
223+
bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/true);
201224
if (Changed) {
202225
PreservedAnalyses PA;
203226
PA.preserveSet<CFGAnalyses>();
@@ -208,7 +231,8 @@ PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
208231

209232
PreservedAnalyses
210233
AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
211-
bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F, /*PromoteToLDS*/ false);
234+
auto &LI = AM.getResult<LoopAnalysis>(F);
235+
bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/false);
212236
if (Changed) {
213237
PreservedAnalyses PA;
214238
PA.preserveSet<CFGAnalyses>();
@@ -225,6 +249,55 @@ FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() {
225249
return new AMDGPUPromoteAllocaToVector();
226250
}
227251

252+
// Gather every use of \p Alloca into \p Uses, walking through
// getelementptr instructions so that users reached via a GEP are
// collected as well (the GEP's own use is recorded too).
static void collectAllocaUses(AllocaInst &Alloca,
                              SmallVectorImpl<Use *> &Uses) {
  SmallVector<Instruction *, 4> Pending;
  Pending.push_back(&Alloca);
  while (!Pending.empty()) {
    Instruction *Cur = Pending.pop_back_val();
    for (Use &U : Cur->uses()) {
      Uses.push_back(&U);
      // GEPs forward the pointer; descend into their users as well.
      if (auto *GEP = dyn_cast<GetElementPtrInst>(U.getUser()))
        Pending.push_back(GEP);
    }
  }
}
265+
266+
// Sort \p Allocas so the most profitable candidates are promoted first.
// Each alloca is scored by counting its (transitive) users; a user inside
// a loop gets an extra LoopUserWeight bonus per level of loop nesting.
// GEPs are skipped when scoring: they are address computations, not real
// users of the memory.
void AMDGPUPromoteAllocaImpl::sortAllocasToPromote(
    SmallVectorImpl<AllocaInst *> &Allocas) {
  DenseMap<AllocaInst *, unsigned> Scores;

  for (AllocaInst *AI : Allocas) {
    LLVM_DEBUG(dbgs() << "Scoring: " << *AI << "\n");

    SmallVector<Use *, 8> AllocaUses;
    collectAllocaUses(*AI, AllocaUses);

    unsigned &Score = Scores[AI];
    for (Use *U : AllocaUses) {
      auto *UserInst = cast<Instruction>(U->getUser());
      // Don't count GEPs themselves; only their users matter.
      if (isa<GetElementPtrInst>(UserInst))
        continue;
      const unsigned UserScore =
          1 + (LoopUserWeight * LI.getLoopDepth(UserInst->getParent()));
      LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *UserInst << "\n");
      Score += UserScore;
    }
    LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n");
  }

  // stable_sort keeps the original (entry-block) order for equal scores.
  stable_sort(Allocas, [&](AllocaInst *A, AllocaInst *B) {
    return Scores.at(A) > Scores.at(B);
  });

  // clang-format off
  LLVM_DEBUG(
    dbgs() << "Sorted Worklist:\n";
    for (auto *A: Allocas)
      dbgs() << " " << *A << "\n";
  );
  // clang-format on
}
300+
228301
bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
229302
Mod = F.getParent();
230303
DL = &Mod->getDataLayout();
@@ -237,6 +310,13 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
237310

238311
bool SufficientLDS = PromoteToLDS ? hasSufficientLocalMem(F) : false;
239312

313+
// Use up to 1/4 of available register budget for vectorization.
314+
// FIXME: Increase the limit for whole function budgets? Perhaps x2?
315+
unsigned VectorizationBudget =
316+
(PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
317+
: (MaxVGPRs * 32)) /
318+
4;
319+
240320
SmallVector<AllocaInst *, 16> Allocas;
241321
for (Instruction &I : F.getEntryBlock()) {
242322
if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
@@ -248,11 +328,27 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
248328
}
249329
}
250330

331+
sortAllocasToPromote(Allocas);
332+
251333
bool Changed = false;
252334
for (AllocaInst *AI : Allocas) {
253-
if (tryPromoteAllocaToVector(*AI))
335+
const unsigned AllocaCost = DL->getTypeSizeInBits(AI->getAllocatedType());
336+
if (AllocaCost > VectorizationBudget) {
337+
LLVM_DEBUG(dbgs() << " Alloca too big for vectorization: " << *AI
338+
<< "\n");
339+
return false;
340+
}
341+
342+
if (tryPromoteAllocaToVector(*AI)) {
254343
Changed = true;
255-
else if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
344+
assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
345+
"Underflow!");
346+
VectorizationBudget -= AllocaCost;
347+
LLVM_DEBUG(dbgs() << " Remaining vectorization budget:"
348+
<< VectorizationBudget << "\n");
349+
if (VectorizationBudget == 0)
350+
break;
351+
} else if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
256352
Changed = true;
257353
}
258354

@@ -641,16 +737,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
641737
ArrayTy->getNumElements());
642738
}
643739

644-
// Use up to 1/4 of available register budget for vectorization.
645-
unsigned Limit = PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
646-
: (MaxVGPRs * 32);
647-
648-
if (DL->getTypeSizeInBits(AllocaTy) * 4 > Limit) {
649-
LLVM_DEBUG(dbgs() << " Alloca too big for vectorization with " << MaxVGPRs
650-
<< " registers available\n");
651-
return false;
652-
}
653-
654740
// FIXME: There is no reason why we can't support larger arrays, we
655741
// are just being conservative for now.
656742
// FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or
@@ -671,7 +757,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
671757
SmallVector<Instruction *> WorkList;
672758
SmallVector<Instruction *> UsersToRemove;
673759
SmallVector<Instruction *> DeferredInsts;
674-
SmallVector<Use *, 8> Uses;
675760
DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;
676761

677762
const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
@@ -680,15 +765,14 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
680765
return false;
681766
};
682767

683-
for (Use &U : Alloca.uses())
684-
Uses.push_back(&U);
768+
SmallVector<Use *, 8> Uses;
769+
collectAllocaUses(Alloca, Uses);
685770

686771
LLVM_DEBUG(dbgs() << " Attempting promotion to: " << *VectorTy << "\n");
687772

688773
Type *VecEltTy = VectorTy->getElementType();
689774
unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;
690-
while (!Uses.empty()) {
691-
Use *U = Uses.pop_back_val();
775+
for (auto *U : Uses) {
692776
Instruction *Inst = cast<Instruction>(U->getUser());
693777

694778
if (Value *Ptr = getLoadStorePointerOperand(Inst)) {
@@ -732,8 +816,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
732816
return RejectUser(Inst, "cannot compute vector index for GEP");
733817

734818
GEPVectorIdx[GEP] = Index;
735-
for (Use &U : Inst->uses())
736-
Uses.push_back(&U);
737819
UsersToRemove.push_back(Inst);
738820
continue;
739821
}

llvm/test/CodeGen/AMDGPU/llc-pipeline.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -195,13 +195,13 @@
195195
; GCN-O1-NEXT: Uniformity Analysis
196196
; GCN-O1-NEXT: AMDGPU atomic optimizations
197197
; GCN-O1-NEXT: Expand Atomic instructions
198-
; GCN-O1-NEXT: AMDGPU Promote Alloca
199198
; GCN-O1-NEXT: Dominator Tree Construction
199+
; GCN-O1-NEXT: Natural Loop Information
200+
; GCN-O1-NEXT: AMDGPU Promote Alloca
200201
; GCN-O1-NEXT: Cycle Info Analysis
201202
; GCN-O1-NEXT: Uniformity Analysis
202203
; GCN-O1-NEXT: AMDGPU IR optimizations
203204
; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl)
204-
; GCN-O1-NEXT: Natural Loop Information
205205
; GCN-O1-NEXT: Canonicalize natural loops
206206
; GCN-O1-NEXT: Scalar Evolution Analysis
207207
; GCN-O1-NEXT: Loop Pass Manager
@@ -470,9 +470,9 @@
470470
; GCN-O1-OPTS-NEXT: Uniformity Analysis
471471
; GCN-O1-OPTS-NEXT: AMDGPU atomic optimizations
472472
; GCN-O1-OPTS-NEXT: Expand Atomic instructions
473-
; GCN-O1-OPTS-NEXT: AMDGPU Promote Alloca
474473
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
475474
; GCN-O1-OPTS-NEXT: Natural Loop Information
475+
; GCN-O1-OPTS-NEXT: AMDGPU Promote Alloca
476476
; GCN-O1-OPTS-NEXT: Canonicalize natural loops
477477
; GCN-O1-OPTS-NEXT: Lazy Branch Probability Analysis
478478
; GCN-O1-OPTS-NEXT: Lazy Block Frequency Analysis
@@ -775,9 +775,9 @@
775775
; GCN-O2-NEXT: Uniformity Analysis
776776
; GCN-O2-NEXT: AMDGPU atomic optimizations
777777
; GCN-O2-NEXT: Expand Atomic instructions
778-
; GCN-O2-NEXT: AMDGPU Promote Alloca
779778
; GCN-O2-NEXT: Dominator Tree Construction
780779
; GCN-O2-NEXT: Natural Loop Information
780+
; GCN-O2-NEXT: AMDGPU Promote Alloca
781781
; GCN-O2-NEXT: Split GEPs to a variadic base and a constant offset for better CSE
782782
; GCN-O2-NEXT: Scalar Evolution Analysis
783783
; GCN-O2-NEXT: Straight line strength reduction
@@ -1084,9 +1084,9 @@
10841084
; GCN-O3-NEXT: Uniformity Analysis
10851085
; GCN-O3-NEXT: AMDGPU atomic optimizations
10861086
; GCN-O3-NEXT: Expand Atomic instructions
1087-
; GCN-O3-NEXT: AMDGPU Promote Alloca
10881087
; GCN-O3-NEXT: Dominator Tree Construction
10891088
; GCN-O3-NEXT: Natural Loop Information
1089+
; GCN-O3-NEXT: AMDGPU Promote Alloca
10901090
; GCN-O3-NEXT: Split GEPs to a variadic base and a constant offset for better CSE
10911091
; GCN-O3-NEXT: Scalar Evolution Analysis
10921092
; GCN-O3-NEXT: Straight line strength reduction
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -debug-only=amdgpu-promote-alloca -amdgpu-promote-alloca-to-vector-limit=512 -passes=amdgpu-promote-alloca %s -o - 2>&1 | FileCheck %s
2+
; REQUIRES: asserts
3+
4+
; CHECK: Scoring: %simpleuser = alloca [4 x i64], align 4, addrspace(5)
5+
; CHECK-NEXT: [+1]: store i32 42, ptr addrspace(5) %simpleuser, align 4
6+
; CHECK-NEXT: => Final Score:1
7+
; CHECK-NEXT: Scoring: %manyusers = alloca [4 x i64], align 4, addrspace(5)
8+
; CHECK-NEXT: [+1]: store i32 %v0.ext, ptr addrspace(5) %manyusers.1, align 4
9+
; CHECK-NEXT: [+1]: %v0 = load i8, ptr addrspace(5) %manyusers.1, align 1
10+
; CHECK-NEXT: [+1]: store i32 %v1.ext, ptr addrspace(5) %manyusers.2, align 4
11+
; CHECK-NEXT: [+1]: %v1 = load i8, ptr addrspace(5) %manyusers.2, align 1
12+
; CHECK-NEXT: => Final Score:4
13+
; CHECK-NEXT: Sorted Worklist:
14+
; CHECK-NEXT: %manyusers = alloca [4 x i64], align 4, addrspace(5)
15+
; CHECK-NEXT: %simpleuser = alloca [4 x i64], align 4, addrspace(5)
16+
; Exercises the basic alloca scoring: every non-GEP user adds 1 to the
; score (no loops here, so the loop-user bonus never applies).
define amdgpu_kernel void @simple_users_scores() #0 {
entry:
  ; One direct user (the store below) -> score of 1.
  %simpleuser = alloca [4 x i64], align 4, addrspace(5)
  ; Four users (two loads + two stores; the GEPs are not counted) -> score of 4.
  %manyusers = alloca [4 x i64], align 4, addrspace(5)

  store i32 42, ptr addrspace(5) %simpleuser

  %manyusers.1 = getelementptr i8, ptr addrspace(5) %manyusers, i64 2
  %v0 = load i8, ptr addrspace(5) %manyusers.1
  %v0.ext = zext i8 %v0 to i32
  store i32 %v0.ext, ptr addrspace(5) %manyusers.1

  %manyusers.2 = getelementptr i8, ptr addrspace(5) %manyusers, i64 1
  %v1 = load i8, ptr addrspace(5) %manyusers.2
  ; Fixed copy/paste typo: was "zext i8 %v0", which left %v1 dead. The
  ; scoring (and the CHECK lines above) are unaffected because only the
  ; loads/stores on the alloca are counted.
  %v1.ext = zext i8 %v1 to i32
  store i32 %v1.ext, ptr addrspace(5) %manyusers.2

  ret void
}
37+
38+
; CHECK: Scoring: %stack = alloca [4 x i64], align 4, addrspace(5)
39+
; CHECK-NEXT: [+5]: store i32 32, ptr addrspace(5) %stack, align 4
40+
; CHECK-NEXT: [+1]: store i32 42, ptr addrspace(5) %stack, align 4
41+
; CHECK-NEXT: [+9]: store i32 32, ptr addrspace(5) %stack.1, align 4
42+
; CHECK-NEXT: [+5]: %outer.cmp = load i1, ptr addrspace(5) %stack.1, align 1
43+
; CHECK-NEXT: [+1]: store i32 64, ptr addrspace(5) %stack.2, align 4
44+
; CHECK-NEXT: [+9]: %inner.cmp = load i1, ptr addrspace(5) %stack.2, align 1
45+
; CHECK-NEXT: => Final Score:30
46+
; Exercises the loop-user bonus: each user scores 1 + (weight * loop depth).
; With the default weight of 4 that is +1 at depth 0, +5 in the outer loop
; and +9 in the inner loop, for the Final Score of 30 checked above.
define amdgpu_kernel void @loop_users_alloca(i1 %x, i2) #0 {
entry:
  ; Total score of 30 (see the CHECK lines above); users in deeper loops
  ; contribute more. (The old "score of 1" comment here was stale.)
  %stack = alloca [4 x i64], align 4, addrspace(5)
  %stack.1 = getelementptr i8, ptr addrspace(5) %stack, i64 4
  %stack.2 = getelementptr i8, ptr addrspace(5) %stack, i64 8

  ; Depth 0 -> +1.
  store i32 42, ptr addrspace(5) %stack
  br label %loop.outer

loop.outer:
  ; Depth 1 -> +5 per user.
  store i32 32, ptr addrspace(5) %stack
  %outer.cmp = load i1, ptr addrspace(5) %stack.1
  br label %loop.inner

loop.inner:
  ; Depth 2 -> +9 per user.
  store i32 32, ptr addrspace(5) %stack.1
  %inner.cmp = load i1, ptr addrspace(5) %stack.2
  br i1 %inner.cmp, label %loop.inner, label %loop.outer

exit:
  ; NOTE: this block has no predecessor (the inner branch only targets the
  ; two loop blocks), so it is unreachable; the scorer still visits it at
  ; depth 0 -> +1.
  store i32 64, ptr addrspace(5) %stack.2
  ret void
}

0 commit comments

Comments
 (0)