32
32
#include " llvm/Analysis/CaptureTracking.h"
33
33
#include " llvm/Analysis/InstSimplifyFolder.h"
34
34
#include " llvm/Analysis/InstructionSimplify.h"
35
+ #include " llvm/Analysis/LoopInfo.h"
35
36
#include " llvm/Analysis/ValueTracking.h"
36
37
#include " llvm/CodeGen/TargetPassConfig.h"
37
38
#include " llvm/IR/IRBuilder.h"
38
39
#include " llvm/IR/IntrinsicInst.h"
39
40
#include " llvm/IR/IntrinsicsAMDGPU.h"
40
41
#include " llvm/IR/IntrinsicsR600.h"
41
42
#include " llvm/IR/PatternMatch.h"
43
+ #include " llvm/InitializePasses.h"
42
44
#include " llvm/Pass.h"
43
45
#include " llvm/Target/TargetMachine.h"
44
46
#include " llvm/Transforms/Utils/SSAUpdater.h"
@@ -64,10 +66,17 @@ static cl::opt<unsigned> PromoteAllocaToVectorLimit(
64
66
cl::desc (" Maximum byte size to consider promote alloca to vector" ),
65
67
cl::init(0 ));
66
68
69
// Command-line knob used when ranking allocas for promotion: in
// sortAllocasToPromote each non-GEP user contributes
// 1 + LoopUserWeight * (loop depth of the user's block) to its alloca's score.
// Defaults to 4.
+ static cl::opt<unsigned >
70
+ LoopUserWeight (" promote-alloca-vector-loop-user-weight" ,
71
+ cl::desc (" The bonus weight of users of allocas within loop "
72
+ " when sorting profitable allocas" ),
73
+ cl::init(4 ));
74
+
67
75
// Shared implementation which can do both promotion to vector and to LDS.
68
76
class AMDGPUPromoteAllocaImpl {
69
77
private:
70
78
const TargetMachine &TM;
79
+ LoopInfo &LI;
71
80
Module *Mod = nullptr ;
72
81
const DataLayout *DL = nullptr ;
73
82
@@ -101,8 +110,11 @@ class AMDGPUPromoteAllocaImpl {
101
110
bool tryPromoteAllocaToVector (AllocaInst &I);
102
111
bool tryPromoteAllocaToLDS (AllocaInst &I, bool SufficientLDS);
103
112
113
+ void sortAllocasToPromote (SmallVectorImpl<AllocaInst *> &Allocas);
114
+
104
115
public:
105
- AMDGPUPromoteAllocaImpl (TargetMachine &TM) : TM(TM) {
116
+ AMDGPUPromoteAllocaImpl (TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {
117
+
106
118
const Triple &TT = TM.getTargetTriple ();
107
119
IsAMDGCN = TT.getArch () == Triple::amdgcn;
108
120
IsAMDHSA = TT.getOS () == Triple::AMDHSA;
@@ -122,7 +134,9 @@ class AMDGPUPromoteAlloca : public FunctionPass {
122
134
if (skipFunction (F))
123
135
return false ;
124
136
if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
125
- return AMDGPUPromoteAllocaImpl (TPC->getTM <TargetMachine>())
137
+ return AMDGPUPromoteAllocaImpl (
138
+ TPC->getTM <TargetMachine>(),
139
+ getAnalysis<LoopInfoWrapperPass>().getLoopInfo ())
126
140
.run (F, /* PromoteToLDS*/ true );
127
141
return false ;
128
142
}
@@ -131,6 +145,7 @@ class AMDGPUPromoteAlloca : public FunctionPass {
131
145
132
146
void getAnalysisUsage (AnalysisUsage &AU) const override {
133
147
AU.setPreservesCFG ();
148
+ AU.addRequired <LoopInfoWrapperPass>();
134
149
FunctionPass::getAnalysisUsage (AU);
135
150
}
136
151
};
@@ -145,7 +160,9 @@ class AMDGPUPromoteAllocaToVector : public FunctionPass {
145
160
if (skipFunction (F))
146
161
return false ;
147
162
if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
148
- return AMDGPUPromoteAllocaImpl (TPC->getTM <TargetMachine>())
163
+ return AMDGPUPromoteAllocaImpl (
164
+ TPC->getTM <TargetMachine>(),
165
+ getAnalysis<LoopInfoWrapperPass>().getLoopInfo ())
149
166
.run (F, /* PromoteToLDS*/ false );
150
167
return false ;
151
168
}
@@ -156,6 +173,7 @@ class AMDGPUPromoteAllocaToVector : public FunctionPass {
156
173
157
174
void getAnalysisUsage (AnalysisUsage &AU) const override {
158
175
AU.setPreservesCFG ();
176
+ AU.addRequired <LoopInfoWrapperPass>();
159
177
FunctionPass::getAnalysisUsage (AU);
160
178
}
161
179
};
@@ -186,18 +204,23 @@ INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
186
204
// Move LDS uses from functions to kernels before promote alloca for accurate
187
205
// estimation of LDS available
188
206
INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy)
207
+ INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
189
208
INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
190
209
" AMDGPU promote alloca to vector or LDS" , false , false )
191
210
192
- INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE " -to-vector" ,
193
- " AMDGPU promote alloca to vector" , false , false )
211
+ INITIALIZE_PASS_BEGIN(AMDGPUPromoteAllocaToVector, DEBUG_TYPE " -to-vector" ,
212
+ " AMDGPU promote alloca to vector" , false , false )
213
+ INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
214
+ INITIALIZE_PASS_END(AMDGPUPromoteAllocaToVector, DEBUG_TYPE " -to-vector" ,
215
+ " AMDGPU promote alloca to vector" , false , false )
194
216
195
217
char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
196
218
char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID;
197
219
198
220
PreservedAnalyses AMDGPUPromoteAllocaPass::run (Function &F,
199
221
FunctionAnalysisManager &AM) {
200
- bool Changed = AMDGPUPromoteAllocaImpl (TM).run (F, /* PromoteToLDS*/ true );
222
+ auto &LI = AM.getResult <LoopAnalysis>(F);
223
+ bool Changed = AMDGPUPromoteAllocaImpl (TM, LI).run (F, /* PromoteToLDS=*/ true );
201
224
if (Changed) {
202
225
PreservedAnalyses PA;
203
226
PA.preserveSet <CFGAnalyses>();
@@ -208,7 +231,8 @@ PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
208
231
209
232
PreservedAnalyses
210
233
AMDGPUPromoteAllocaToVectorPass::run (Function &F, FunctionAnalysisManager &AM) {
211
- bool Changed = AMDGPUPromoteAllocaImpl (TM).run (F, /* PromoteToLDS*/ false );
234
+ auto &LI = AM.getResult <LoopAnalysis>(F);
235
+ bool Changed = AMDGPUPromoteAllocaImpl (TM, LI).run (F, /* PromoteToLDS=*/ false );
212
236
if (Changed) {
213
237
PreservedAnalyses PA;
214
238
PA.preserveSet <CFGAnalyses>();
@@ -225,6 +249,55 @@ FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() {
225
249
return new AMDGPUPromoteAllocaToVector ();
226
250
}
227
251
252
+ static void collectAllocaUses (AllocaInst &Alloca,
253
+ SmallVectorImpl<Use *> &Uses) {
254
+ SmallVector<Instruction *, 4 > WorkList ({&Alloca});
255
+ while (!WorkList.empty ()) {
256
+ auto *Cur = WorkList.pop_back_val ();
257
+ for (auto &U : Cur->uses ()) {
258
+ Uses.push_back (&U);
259
+
260
+ if (isa<GetElementPtrInst>(U.getUser ()))
261
+ WorkList.push_back (cast<Instruction>(U.getUser ()));
262
+ }
263
+ }
264
+ }
265
+
266
+ void AMDGPUPromoteAllocaImpl::sortAllocasToPromote (
267
+ SmallVectorImpl<AllocaInst *> &Allocas) {
268
+ DenseMap<AllocaInst *, unsigned > Scores;
269
+
270
+ for (auto *Alloca : Allocas) {
271
+ LLVM_DEBUG (dbgs () << " Scoring: " << *Alloca << " \n " );
272
+ unsigned &Score = Scores[Alloca];
273
+ // Increment score by one for each user + a bonus for users within loops.
274
+ SmallVector<Use *, 8 > Uses;
275
+ collectAllocaUses (*Alloca, Uses);
276
+ for (auto *U : Uses) {
277
+ Instruction *Inst = cast<Instruction>(U->getUser ());
278
+ if (isa<GetElementPtrInst>(Inst))
279
+ continue ;
280
+ unsigned UserScore =
281
+ 1 + (LoopUserWeight * LI.getLoopDepth (Inst->getParent ()));
282
+ LLVM_DEBUG (dbgs () << " [+" << UserScore << " ]:\t " << *Inst << " \n " );
283
+ Score += UserScore;
284
+ }
285
+ LLVM_DEBUG (dbgs () << " => Final Score:" << Score << " \n " );
286
+ }
287
+
288
+ stable_sort (Allocas, [&](AllocaInst *A, AllocaInst *B) {
289
+ return Scores.at (A) > Scores.at (B);
290
+ });
291
+
292
+ // clang-format off
293
+ LLVM_DEBUG (
294
+ dbgs () << " Sorted Worklist:\n " ;
295
+ for (auto *A: Allocas)
296
+ dbgs () << " " << *A << " \n " ;
297
+ );
298
+ // clang-format on
299
+ }
300
+
228
301
bool AMDGPUPromoteAllocaImpl::run (Function &F, bool PromoteToLDS) {
229
302
Mod = F.getParent ();
230
303
DL = &Mod->getDataLayout ();
@@ -237,6 +310,13 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
237
310
238
311
bool SufficientLDS = PromoteToLDS ? hasSufficientLocalMem (F) : false ;
239
312
313
+ // Use up to 1/4 of available register budget for vectorization.
314
+ // FIXME: Increase the limit for whole function budgets? Perhaps x2?
315
+ unsigned VectorizationBudget =
316
+ (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
317
+ : (MaxVGPRs * 32 )) /
318
+ 4 ;
319
+
240
320
SmallVector<AllocaInst *, 16 > Allocas;
241
321
for (Instruction &I : F.getEntryBlock ()) {
242
322
if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
@@ -248,11 +328,27 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
248
328
}
249
329
}
250
330
331
+ sortAllocasToPromote (Allocas);
332
+
251
333
bool Changed = false ;
252
334
// Walk the candidates in priority order. Each alloca is first offered to
// vector promotion, provided it still fits in the remaining register budget;
// anything that does not fit, or that vector promotion rejects, falls through
// to LDS promotion when that is enabled.
//
// Running out of budget must neither leave the function (that would drop a
// `Changed = true` recorded for an earlier alloca) nor stop the walk (the
// remaining allocas may still be promotable to LDS).
for (AllocaInst *AI : Allocas) {
  // Keep the full 64-bit size: narrowing before the comparison could make a
  // huge alloca wrap around and appear to fit in the budget.
  const uint64_t AllocaCost = DL->getTypeSizeInBits(AI->getAllocatedType());

  if (AllocaCost <= VectorizationBudget) {
    if (tryPromoteAllocaToVector(*AI)) {
      Changed = true;
      // Guaranteed by the guard above; also holds for a zero cost.
      assert(AllocaCost <= VectorizationBudget && "Underflow!");
      VectorizationBudget -= static_cast<unsigned>(AllocaCost);
      LLVM_DEBUG(dbgs() << " Remaining vectorization budget:"
                        << VectorizationBudget << " \n ");
      continue;
    }
  } else {
    LLVM_DEBUG(dbgs() << " Alloca too big for vectorization: " << *AI
                      << " \n ");
  }

  if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
    Changed = true;
}
258
354
@@ -641,16 +737,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
641
737
ArrayTy->getNumElements ());
642
738
}
643
739
644
- // Use up to 1/4 of available register budget for vectorization.
645
- unsigned Limit = PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
646
- : (MaxVGPRs * 32 );
647
-
648
- if (DL->getTypeSizeInBits (AllocaTy) * 4 > Limit) {
649
- LLVM_DEBUG (dbgs () << " Alloca too big for vectorization with " << MaxVGPRs
650
- << " registers available\n " );
651
- return false ;
652
- }
653
-
654
740
// FIXME: There is no reason why we can't support larger arrays, we
655
741
// are just being conservative for now.
656
742
// FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or
@@ -671,7 +757,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
671
757
SmallVector<Instruction *> WorkList;
672
758
SmallVector<Instruction *> UsersToRemove;
673
759
SmallVector<Instruction *> DeferredInsts;
674
- SmallVector<Use *, 8 > Uses;
675
760
DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;
676
761
677
762
const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
@@ -680,15 +765,14 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
680
765
return false ;
681
766
};
682
767
683
- for ( Use &U : Alloca. uses ())
684
- Uses. push_back (&U );
768
+ SmallVector< Use *, 8 > Uses;
769
+ collectAllocaUses (Alloca, Uses);
685
770
686
771
LLVM_DEBUG (dbgs () << " Attempting promotion to: " << *VectorTy << " \n " );
687
772
688
773
Type *VecEltTy = VectorTy->getElementType ();
689
774
unsigned ElementSize = DL->getTypeSizeInBits (VecEltTy) / 8 ;
690
- while (!Uses.empty ()) {
691
- Use *U = Uses.pop_back_val ();
775
+ for (auto *U : Uses) {
692
776
Instruction *Inst = cast<Instruction>(U->getUser ());
693
777
694
778
if (Value *Ptr = getLoadStorePointerOperand (Inst)) {
@@ -732,8 +816,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
732
816
return RejectUser (Inst, " cannot compute vector index for GEP" );
733
817
734
818
GEPVectorIdx[GEP] = Index;
735
- for (Use &U : Inst->uses ())
736
- Uses.push_back (&U);
737
819
UsersToRemove.push_back (Inst);
738
820
continue ;
739
821
}
0 commit comments