@@ -66,6 +66,18 @@ static cl::opt<unsigned> PromoteAllocaToVectorLimit(
66
66
cl::desc (" Maximum byte size to consider promote alloca to vector" ),
67
67
cl::init(0 ));
68
68
69
+ static cl::opt<unsigned > PromoteAllocaToVectorMaxElements (
70
+ " amdgpu-promote-alloca-to-vector-max-elements" ,
71
+ cl::desc (" Maximum vector size (in elements) to use when promoting alloca" ),
72
+ cl::init(16 ));
73
+
74
+ // By default, use up to 1/4 of available register budget for vectorization.
75
+ // FIXME: Increase the limit for whole function budgets? Perhaps x2?
76
+ static cl::opt<unsigned > PromoteAllocaToVectorVGPRRatio (
77
+ " amdgpu-promote-alloca-to-vector-vgpr-ratio" ,
78
+ cl::desc (" Ratio of VGPRs to budget for promoting alloca to vectors" ),
79
+ cl::init(4 ));
80
+
69
81
static cl::opt<unsigned >
70
82
LoopUserWeight (" promote-alloca-vector-loop-user-weight" ,
71
83
cl::desc (" The bonus weight of users of allocas within loop "
@@ -310,12 +322,17 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
310
322
311
323
bool SufficientLDS = PromoteToLDS ? hasSufficientLocalMem (F) : false ;
312
324
313
- // Use up to 1/4 of available register budget for vectorization.
314
- // FIXME: Increase the limit for whole function budgets? Perhaps x2?
325
+ const unsigned VGPRRatio =
326
+ PromoteAllocaToVectorVGPRRatio.getNumOccurrences ()
327
+ ? PromoteAllocaToVectorVGPRRatio
328
+ : F.getFnAttributeAsParsedInteger (
329
+ " amdgpu-promote-alloca-to-vector-vgpr-ratio" ,
330
+ PromoteAllocaToVectorVGPRRatio);
331
+
315
332
unsigned VectorizationBudget =
316
333
(PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
317
334
: (MaxVGPRs * 32 )) /
318
- 4 ;
335
+ VGPRRatio ;
319
336
320
337
SmallVector<AllocaInst *, 16 > Allocas;
321
338
for (Instruction &I : F.getEntryBlock ()) {
@@ -398,7 +415,8 @@ calculateVectorIndex(Value *Ptr,
398
415
}
399
416
400
417
static Value *GEPToVectorIndex (GetElementPtrInst *GEP, AllocaInst *Alloca,
401
- Type *VecElemTy, const DataLayout &DL) {
418
+ Type *VecElemTy, const DataLayout &DL,
419
+ SmallVector<Instruction *> &NewInsts) {
402
420
// TODO: Extracting a "multiple of X" from a GEP might be a useful generic
403
421
// helper.
404
422
unsigned BW = DL.getIndexTypeSizeInBits (GEP->getType ());
@@ -412,22 +430,37 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
412
430
if (VarOffsets.size () > 1 )
413
431
return nullptr ;
414
432
415
- if (VarOffsets.size () == 1 ) {
416
- // Only handle cases where we don't need to insert extra arithmetic
417
- // instructions.
418
- const auto &VarOffset = VarOffsets.front ();
419
- if (!ConstOffset.isZero () || VarOffset.second != VecElemSize)
420
- return nullptr ;
421
- return VarOffset.first ;
422
- }
423
-
424
433
APInt Quot;
425
434
uint64_t Rem;
426
435
APInt::udivrem (ConstOffset, VecElemSize, Quot, Rem);
427
436
if (Rem != 0 )
428
437
return nullptr ;
429
438
430
- return ConstantInt::get (GEP->getContext (), Quot);
439
+ ConstantInt *ConstIndex = ConstantInt::get (GEP->getContext (), Quot);
440
+ if (VarOffsets.size () == 0 )
441
+ return ConstIndex;
442
+
443
+ IRBuilder<> Builder (GEP);
444
+
445
+ const auto &VarOffset = VarOffsets.front ();
446
+ APInt::udivrem (VarOffset.second , VecElemSize, Quot, Rem);
447
+ if (Rem != 0 || Quot.isZero ())
448
+ return nullptr ;
449
+
450
+ Value *Offset = VarOffset.first ;
451
+ if (!Quot.isOne ()) {
452
+ ConstantInt *ConstMul = ConstantInt::get (GEP->getContext (), Quot);
453
+ Offset = Builder.CreateMul (Offset, ConstMul);
454
+ if (Instruction *NewInst = dyn_cast<Instruction>(Offset))
455
+ NewInsts.push_back (NewInst);
456
+ }
457
+ if (ConstOffset.isZero ())
458
+ return Offset;
459
+
460
+ Value *IndexAdd = Builder.CreateAdd (ConstIndex, Offset);
461
+ if (Instruction *NewInst = dyn_cast<Instruction>(IndexAdd))
462
+ NewInsts.push_back (NewInst);
463
+ return IndexAdd;
431
464
}
432
465
433
466
/// Promotes a single user of the alloca to a vector form.
@@ -735,23 +768,48 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
735
768
Type *AllocaTy = Alloca.getAllocatedType ();
736
769
auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
737
770
if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
738
- if (VectorType::isValidElementType (ArrayTy->getElementType ()) &&
739
- ArrayTy->getNumElements () > 0 )
740
- VectorTy = FixedVectorType::get (ArrayTy->getElementType (),
741
- ArrayTy->getNumElements ());
771
+ uint64_t NumElems = 1 ;
772
+ Type *ElemTy;
773
+ do {
774
+ NumElems *= ArrayTy->getNumElements ();
775
+ ElemTy = ArrayTy->getElementType ();
776
+ } while ((ArrayTy = dyn_cast<ArrayType>(ElemTy)));
777
+
778
+ // Flatten an array of vectors down to its scalar element type.
779
+ auto *InnerVectorTy = dyn_cast<FixedVectorType>(ElemTy);
780
+ if (InnerVectorTy) {
781
+ NumElems *= InnerVectorTy->getNumElements ();
782
+ ElemTy = InnerVectorTy->getElementType ();
783
+ }
784
+
785
+ if (VectorType::isValidElementType (ElemTy) && NumElems > 0 ) {
786
+ unsigned ElementSize = DL->getTypeSizeInBits (ElemTy) / 8 ;
787
+ unsigned AllocaSize = DL->getTypeStoreSize (AllocaTy);
788
+ // Expand vector if required to match padding of inner type,
789
+ // i.e. odd size subvectors.
790
+ // Storage size of new vector must match that of alloca for correct
791
+ // behaviour of byte offsets and GEP computation.
792
+ if (NumElems * ElementSize != AllocaSize)
793
+ NumElems = AllocaSize / ElementSize;
794
+ if (NumElems > 0 && (AllocaSize % ElementSize) == 0 )
795
+ VectorTy = FixedVectorType::get (ElemTy, NumElems);
796
+ }
742
797
}
743
798
744
- // FIXME: There is no reason why we can't support larger arrays, we
745
- // are just being conservative for now.
746
- // FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or
747
- // equivalent. Potentially these could also be promoted but we don't currently
748
- // handle this case
749
799
if (!VectorTy) {
750
800
LLVM_DEBUG (dbgs () << " Cannot convert type to vector\n " );
751
801
return false ;
752
802
}
753
803
754
- if (VectorTy->getNumElements () > 16 || VectorTy->getNumElements () < 2 ) {
804
+ const unsigned MaxElements =
805
+ PromoteAllocaToVectorMaxElements.getNumOccurrences ()
806
+ ? PromoteAllocaToVectorMaxElements
807
+ : Alloca.getParent ()->getParent ()->getFnAttributeAsParsedInteger (
808
+ " amdgpu-promote-alloca-to-vector-max-elements" ,
809
+ PromoteAllocaToVectorMaxElements);
810
+
811
+ if (VectorTy->getNumElements () > MaxElements ||
812
+ VectorTy->getNumElements () < 2 ) {
755
813
LLVM_DEBUG (dbgs () << " " << *VectorTy
756
814
<< " has an unsupported number of elements\n " );
757
815
return false ;
@@ -761,11 +819,14 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
761
819
SmallVector<Instruction *> WorkList;
762
820
SmallVector<Instruction *> UsersToRemove;
763
821
SmallVector<Instruction *> DeferredInsts;
822
+ SmallVector<Instruction *> NewGEPInsts;
764
823
DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;
765
824
766
825
const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
767
826
LLVM_DEBUG (dbgs () << " Cannot promote alloca to vector: " << Msg << " \n "
768
827
<< " " << *Inst << " \n " );
828
+ for (auto *Inst : reverse (NewGEPInsts))
829
+ Inst->eraseFromParent ();
769
830
return false ;
770
831
};
771
832
@@ -815,7 +876,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
815
876
if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
816
877
// If we can't compute a vector index from this GEP, then we can't
817
878
// promote this alloca to vector.
818
- Value *Index = GEPToVectorIndex (GEP, &Alloca, VecEltTy, *DL);
879
+ Value *Index = GEPToVectorIndex (GEP, &Alloca, VecEltTy, *DL, NewGEPInsts );
819
880
if (!Index)
820
881
return RejectUser (Inst, " cannot compute vector index for GEP" );
821
882
0 commit comments