Skip to content

Commit 9293d1a

Browse files
committed
[AMDGPU] Extend promotion of alloca to vectors
* Add multi-dimensional array support
* Make maximum vector size tunable
* Make ratio of VGPRs used for vector promotion tunable
1 parent 5815990 commit 9293d1a

9 files changed: +907 additions, −45 deletions

llvm/docs/AMDGPUUsage.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1720,6 +1720,10 @@ The AMDGPU backend supports the following LLVM IR attributes.
17201720
"amdgpu-sgpr-hazard-mem-wait-cull-threshold" Sets the number of active SGPR hazards that must be present before
17211721
inserting a cull sequence at a memory wait.
17221722

1723+
"amdgpu-promote-alloca-to-vector-max-elements" Maximum vector size (in elements) to create when promoting alloca.
1724+
1725+
"amdgpu-promote-alloca-to-vector-vgpr-ratio" Ratio of VGPRs to budget for promoting alloca to vectors.
1726+
17231727
================================================ ==========================================================
17241728

17251729
Calling Conventions

llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

Lines changed: 86 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,18 @@ static cl::opt<unsigned> PromoteAllocaToVectorLimit(
6666
cl::desc("Maximum byte size to consider promote alloca to vector"),
6767
cl::init(0));
6868

69+
static cl::opt<unsigned> PromoteAllocaToVectorMaxElements(
70+
"amdgpu-promote-alloca-to-vector-max-elements",
71+
cl::desc("Maximum vector size (in elements) to use when promoting alloca"),
72+
cl::init(16));
73+
74+
// Use up to 1/4 of available register budget for vectorization.
75+
// FIXME: Increase the limit for whole function budgets? Perhaps x2?
76+
static cl::opt<unsigned> PromoteAllocaToVectorVGPRRatio(
77+
"amdgpu-promote-alloca-to-vector-vgpr-ratio",
78+
cl::desc("Ratio of VGPRs to budget for promoting alloca to vectors"),
79+
cl::init(4));
80+
6981
static cl::opt<unsigned>
7082
LoopUserWeight("promote-alloca-vector-loop-user-weight",
7183
cl::desc("The bonus weight of users of allocas within loop "
@@ -310,12 +322,17 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
310322

311323
bool SufficientLDS = PromoteToLDS ? hasSufficientLocalMem(F) : false;
312324

313-
// Use up to 1/4 of available register budget for vectorization.
314-
// FIXME: Increase the limit for whole function budgets? Perhaps x2?
325+
const unsigned VGPRRatio =
326+
PromoteAllocaToVectorVGPRRatio.getNumOccurrences()
327+
? PromoteAllocaToVectorVGPRRatio
328+
: F.getFnAttributeAsParsedInteger(
329+
"amdgpu-promote-alloca-to-vector-vgpr-ratio",
330+
PromoteAllocaToVectorVGPRRatio);
331+
315332
unsigned VectorizationBudget =
316333
(PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
317334
: (MaxVGPRs * 32)) /
318-
4;
335+
VGPRRatio;
319336

320337
SmallVector<AllocaInst *, 16> Allocas;
321338
for (Instruction &I : F.getEntryBlock()) {
@@ -398,7 +415,8 @@ calculateVectorIndex(Value *Ptr,
398415
}
399416

400417
static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
401-
Type *VecElemTy, const DataLayout &DL) {
418+
Type *VecElemTy, const DataLayout &DL,
419+
SmallVector<Instruction *> &NewInsts) {
402420
// TODO: Extracting a "multiple of X" from a GEP might be a useful generic
403421
// helper.
404422
unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
@@ -412,22 +430,37 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
412430
if (VarOffsets.size() > 1)
413431
return nullptr;
414432

415-
if (VarOffsets.size() == 1) {
416-
// Only handle cases where we don't need to insert extra arithmetic
417-
// instructions.
418-
const auto &VarOffset = VarOffsets.front();
419-
if (!ConstOffset.isZero() || VarOffset.second != VecElemSize)
420-
return nullptr;
421-
return VarOffset.first;
422-
}
423-
424433
APInt Quot;
425434
uint64_t Rem;
426435
APInt::udivrem(ConstOffset, VecElemSize, Quot, Rem);
427436
if (Rem != 0)
428437
return nullptr;
429438

430-
return ConstantInt::get(GEP->getContext(), Quot);
439+
ConstantInt *ConstIndex = ConstantInt::get(GEP->getContext(), Quot);
440+
if (VarOffsets.size() == 0)
441+
return ConstIndex;
442+
443+
IRBuilder<> Builder(GEP);
444+
445+
const auto &VarOffset = VarOffsets.front();
446+
APInt::udivrem(VarOffset.second, VecElemSize, Quot, Rem);
447+
if (Rem != 0 || Quot.isZero())
448+
return nullptr;
449+
450+
Value *Offset = VarOffset.first;
451+
if (!Quot.isOne()) {
452+
ConstantInt *ConstMul = ConstantInt::get(GEP->getContext(), Quot);
453+
Offset = Builder.CreateMul(Offset, ConstMul);
454+
if (Instruction *NewInst = dyn_cast<Instruction>(Offset))
455+
NewInsts.push_back(NewInst);
456+
}
457+
if (ConstOffset.isZero())
458+
return Offset;
459+
460+
Value *IndexAdd = Builder.CreateAdd(ConstIndex, Offset);
461+
if (Instruction *NewInst = dyn_cast<Instruction>(IndexAdd))
462+
NewInsts.push_back(NewInst);
463+
return IndexAdd;
431464
}
432465

433466
/// Promotes a single user of the alloca to a vector form.
@@ -735,23 +768,48 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
735768
Type *AllocaTy = Alloca.getAllocatedType();
736769
auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
737770
if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
738-
if (VectorType::isValidElementType(ArrayTy->getElementType()) &&
739-
ArrayTy->getNumElements() > 0)
740-
VectorTy = FixedVectorType::get(ArrayTy->getElementType(),
741-
ArrayTy->getNumElements());
771+
uint64_t NumElems = 1;
772+
Type *ElemTy;
773+
do {
774+
NumElems *= ArrayTy->getNumElements();
775+
ElemTy = ArrayTy->getElementType();
776+
} while ((ArrayTy = dyn_cast<ArrayType>(ElemTy)));
777+
778+
// Check for array of vectors
779+
auto *InnerVectorTy = dyn_cast<FixedVectorType>(ElemTy);
780+
if (InnerVectorTy) {
781+
NumElems *= InnerVectorTy->getNumElements();
782+
ElemTy = InnerVectorTy->getElementType();
783+
}
784+
785+
if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
786+
unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8;
787+
unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
788+
// Expand vector if required to match padding of inner type,
789+
// i.e. odd size subvectors.
790+
// Storage size of new vector must match that of alloca for correct
791+
// behaviour of byte offsets and GEP computation.
792+
if (NumElems * ElementSize != AllocaSize)
793+
NumElems = AllocaSize / ElementSize;
794+
if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
795+
VectorTy = FixedVectorType::get(ElemTy, NumElems);
796+
}
742797
}
743798

744-
// FIXME: There is no reason why we can't support larger arrays, we
745-
// are just being conservative for now.
746-
// FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or
747-
// equivalent. Potentially these could also be promoted but we don't currently
748-
// handle this case
749799
if (!VectorTy) {
750800
LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
751801
return false;
752802
}
753803

754-
if (VectorTy->getNumElements() > 16 || VectorTy->getNumElements() < 2) {
804+
const unsigned MaxElements =
805+
PromoteAllocaToVectorMaxElements.getNumOccurrences()
806+
? PromoteAllocaToVectorMaxElements
807+
: Alloca.getParent()->getParent()->getFnAttributeAsParsedInteger(
808+
"amdgpu-promote-alloca-to-vector-max-elements",
809+
PromoteAllocaToVectorMaxElements);
810+
811+
if (VectorTy->getNumElements() > MaxElements ||
812+
VectorTy->getNumElements() < 2) {
755813
LLVM_DEBUG(dbgs() << " " << *VectorTy
756814
<< " has an unsupported number of elements\n");
757815
return false;
@@ -761,11 +819,14 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
761819
SmallVector<Instruction *> WorkList;
762820
SmallVector<Instruction *> UsersToRemove;
763821
SmallVector<Instruction *> DeferredInsts;
822+
SmallVector<Instruction *> NewGEPInsts;
764823
DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;
765824

766825
const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
767826
LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n"
768827
<< " " << *Inst << "\n");
828+
for (auto *Inst : reverse(NewGEPInsts))
829+
Inst->eraseFromParent();
769830
return false;
770831
};
771832

@@ -815,7 +876,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
815876
if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
816877
// If we can't compute a vector index from this GEP, then we can't
817878
// promote this alloca to vector.
818-
Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL);
879+
Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts);
819880
if (!Index)
820881
return RejectUser(Inst, "cannot compute vector index for GEP");
821882

llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -258,7 +258,7 @@ entry:
258258
; FUNC-LABEL: {{^}}no_overlap:
259259
;
260260
; A total of 5 bytes should be allocated and used.
261-
; SI: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 ;
261+
; SI-ALLOCA: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 ;
262262
define amdgpu_kernel void @no_overlap(ptr addrspace(1) %out, i32 %in) #0 {
263263
entry:
264264
%0 = alloca [3 x i8], align 1, addrspace(5)
@@ -281,6 +281,7 @@ entry:
281281
ret void
282282
}
283283

284+
; FUNC-LABEL: {{^}}char_array_array:
284285
define amdgpu_kernel void @char_array_array(ptr addrspace(1) %out, i32 %index) #0 {
285286
entry:
286287
%alloca = alloca [2 x [2 x i8]], addrspace(5)
@@ -294,6 +295,7 @@ entry:
294295
ret void
295296
}
296297

298+
; FUNC-LABEL: {{^}}i32_array_array:
297299
define amdgpu_kernel void @i32_array_array(ptr addrspace(1) %out, i32 %index) #0 {
298300
entry:
299301
%alloca = alloca [2 x [2 x i32]], addrspace(5)
@@ -306,6 +308,7 @@ entry:
306308
ret void
307309
}
308310

311+
; FUNC-LABEL: {{^}}i64_array_array:
309312
define amdgpu_kernel void @i64_array_array(ptr addrspace(1) %out, i32 %index) #0 {
310313
entry:
311314
%alloca = alloca [2 x [2 x i64]], addrspace(5)
@@ -319,7 +322,7 @@ entry:
319322
}
320323

321324
%struct.pair32 = type { i32, i32 }
322-
325+
; FUNC-LABEL: {{^}}struct_array_array:
323326
define amdgpu_kernel void @struct_array_array(ptr addrspace(1) %out, i32 %index) #0 {
324327
entry:
325328
%alloca = alloca [2 x [2 x %struct.pair32]], addrspace(5)
@@ -333,6 +336,7 @@ entry:
333336
ret void
334337
}
335338

339+
; FUNC-LABEL: {{^}}struct_pair32_array:
336340
define amdgpu_kernel void @struct_pair32_array(ptr addrspace(1) %out, i32 %index) #0 {
337341
entry:
338342
%alloca = alloca [2 x %struct.pair32], addrspace(5)
@@ -346,6 +350,7 @@ entry:
346350
ret void
347351
}
348352

353+
; FUNC-LABEL: {{^}}select_private:
349354
define amdgpu_kernel void @select_private(ptr addrspace(1) %out, i32 %in) nounwind {
350355
entry:
351356
%tmp = alloca [2 x i32], addrspace(5)

llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,7 @@ declare void @llvm.amdgcn.s.barrier() #2
2222
; SI-ALLOCA: s_barrier
2323
; SI-ALLOCA: buffer_load_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], 0 offen offset:64
2424
;
25-
; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this
26-
; alloca to a vector. It currently fails because it does not know how
27-
; to interpret:
28-
; getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 1, i32 %b
29-
30-
; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 64
31-
; SI-PROMOTE: ds_write_b32 [[PTRREG]]
25+
; SI-PROMOTE: LDSByteSize: 0
3226
define amdgpu_kernel void @test_private_array_ptr_calc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) #0 {
3327
%alloca = alloca [16 x i32], align 16, addrspace(5)
3428
%mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0);

Comments (0)