[AMDGPU] Extend promotion of alloca to vectors #127973

Merged · 3 commits · Mar 12, 2025

4 changes: 4 additions & 0 deletions llvm/docs/AMDGPUUsage.rst
@@ -1720,6 +1720,10 @@ The AMDGPU backend supports the following LLVM IR attributes.
"amdgpu-sgpr-hazard-mem-wait-cull-threshold" Sets the number of active SGPR hazards that must be present before
inserting a cull sequence at a memory wait.

"amdgpu-promote-alloca-to-vector-max-regs" Maximum vector size (in 32b registers) to create when promoting alloca.

"amdgpu-promote-alloca-to-vector-vgpr-ratio" Ratio of VGPRs to budget for promoting alloca to vectors.

================================================ ==========================================================
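
For illustration only (not part of this patch): both new attributes are ordinary string function attributes, so a kernel can request different promotion limits roughly as in the hypothetical IR below, where the values "24" and "2" are invented for the example.

define amdgpu_kernel void @big_private_array() #0 {
entry:
  %buf = alloca [32 x float], align 4, addrspace(5)
  store float 0.0, ptr addrspace(5) %buf
  ret void
}

attributes #0 = { "amdgpu-promote-alloca-to-vector-max-regs"="24"
                  "amdgpu-promote-alloca-to-vector-vgpr-ratio"="2" }

Per the setFunctionLimits change below, the matching cl::opt flags override these attributes whenever they are passed explicitly on the command line.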

Calling Conventions
119 changes: 94 additions & 25 deletions llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -66,6 +66,19 @@ static cl::opt<unsigned> PromoteAllocaToVectorLimit(
cl::desc("Maximum byte size to consider promote alloca to vector"),
cl::init(0));

static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
"amdgpu-promote-alloca-to-vector-max-regs",
cl::desc(
"Maximum vector size (in 32b registers) to use when promoting alloca"),
cl::init(16));

// Use up to 1/4 of available register budget for vectorization.
// FIXME: Increase the limit for whole function budgets? Perhaps x2?
static cl::opt<unsigned> PromoteAllocaToVectorVGPRRatio(
"amdgpu-promote-alloca-to-vector-vgpr-ratio",
cl::desc("Ratio of VGPRs to budget for promoting alloca to vectors"),
cl::init(4));

static cl::opt<unsigned>
LoopUserWeight("promote-alloca-vector-loop-user-weight",
cl::desc("The bonus weight of users of allocas within loop "
@@ -84,6 +97,8 @@ class AMDGPUPromoteAllocaImpl {
uint32_t LocalMemLimit = 0;
uint32_t CurrentLocalMemUsage = 0;
unsigned MaxVGPRs;
unsigned VGPRBudgetRatio;
unsigned MaxVectorRegs;

bool IsAMDGCN = false;
bool IsAMDHSA = false;
@@ -112,6 +127,8 @@ class AMDGPUPromoteAllocaImpl {

void sortAllocasToPromote(SmallVectorImpl<AllocaInst *> &Allocas);

void setFunctionLimits(const Function &F);

public:
AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {

@@ -298,6 +315,19 @@ void AMDGPUPromoteAllocaImpl::sortAllocasToPromote(
// clang-format on
}

void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
// Load per-function limits, overriding with global options where appropriate.
MaxVectorRegs = F.getFnAttributeAsParsedInteger(
"amdgpu-promote-alloca-to-vector-max-regs", PromoteAllocaToVectorMaxRegs);
if (PromoteAllocaToVectorMaxRegs.getNumOccurrences())
MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
VGPRBudgetRatio = F.getFnAttributeAsParsedInteger(
"amdgpu-promote-alloca-to-vector-vgpr-ratio",
PromoteAllocaToVectorVGPRRatio);
if (PromoteAllocaToVectorVGPRRatio.getNumOccurrences())
VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
}

bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
Mod = F.getParent();
DL = &Mod->getDataLayout();
@@ -307,15 +337,14 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
return false;

MaxVGPRs = getMaxVGPRs(TM, F);
setFunctionLimits(F);

bool SufficientLDS = PromoteToLDS ? hasSufficientLocalMem(F) : false;

// Use up to 1/4 of available register budget for vectorization.
// FIXME: Increase the limit for whole function budgets? Perhaps x2?
unsigned VectorizationBudget =
(PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
: (MaxVGPRs * 32)) /
4;
VGPRBudgetRatio;

SmallVector<AllocaInst *, 16> Allocas;
for (Instruction &I : F.getEntryBlock()) {
@@ -398,7 +427,8 @@ calculateVectorIndex(Value *Ptr,
}

static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
Type *VecElemTy, const DataLayout &DL) {
Type *VecElemTy, const DataLayout &DL,
SmallVector<Instruction *> &NewInsts) {
// TODO: Extracting a "multiple of X" from a GEP might be a useful generic
// helper.
unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
@@ -412,22 +442,37 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
if (VarOffsets.size() > 1)
return nullptr;

if (VarOffsets.size() == 1) {
// Only handle cases where we don't need to insert extra arithmetic
// instructions.
const auto &VarOffset = VarOffsets.front();
if (!ConstOffset.isZero() || VarOffset.second != VecElemSize)
return nullptr;
return VarOffset.first;
}

APInt Quot;
uint64_t Rem;
APInt::udivrem(ConstOffset, VecElemSize, Quot, Rem);
if (Rem != 0)
return nullptr;

return ConstantInt::get(GEP->getContext(), Quot);
ConstantInt *ConstIndex = ConstantInt::get(GEP->getContext(), Quot);
if (VarOffsets.size() == 0)
return ConstIndex;

IRBuilder<> Builder(GEP);

const auto &VarOffset = VarOffsets.front();
APInt::udivrem(VarOffset.second, VecElemSize, Quot, Rem);
if (Rem != 0 || Quot.isZero())
return nullptr;

Value *Offset = VarOffset.first;
if (!Quot.isOne()) {
ConstantInt *ConstMul = ConstantInt::get(GEP->getContext(), Quot);
Offset = Builder.CreateMul(Offset, ConstMul);
if (Instruction *NewInst = dyn_cast<Instruction>(Offset))
NewInsts.push_back(NewInst);
}
if (ConstOffset.isZero())
return Offset;

Value *IndexAdd = Builder.CreateAdd(ConstIndex, Offset);
if (Instruction *NewInst = dyn_cast<Instruction>(IndexAdd))
NewInsts.push_back(NewInst);
return IndexAdd;
}
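
As a sketch of what the extended GEPToVectorIndex accepts (illustrative IR, not taken from this patch): at most one variable offset whose stride is a non-zero multiple of the element size, optionally combined with a constant offset. The rebuilt index arithmetic is recorded in NewInsts so RejectUser can erase it if promotion of the alloca later fails.

define amdgpu_kernel void @gep_example(i32 %i) {
entry:
  %a = alloca [8 x [2 x i32]], align 4, addrspace(5)   ; flattened to <16 x i32> by this pass
  %p = getelementptr inbounds [8 x [2 x i32]], ptr addrspace(5) %a, i32 0, i32 %i, i32 1
  store i32 0, ptr addrspace(5) %p                     ; byte offset 8*%i + 4, element size 4
  ret void
}

For this GEP the pass now materializes the vector index as (hypothetical value names) %mul = mul i32 %i, 2 followed by %idx = add i32 1, %mul; the previous code rejected any GEP whose variable offset came with a non-zero constant offset or a stride different from the element size.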

/// Promotes a single user of the alloca to a vector form.
@@ -735,23 +780,44 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
Type *AllocaTy = Alloca.getAllocatedType();
auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
if (VectorType::isValidElementType(ArrayTy->getElementType()) &&
ArrayTy->getNumElements() > 0)
VectorTy = FixedVectorType::get(ArrayTy->getElementType(),
ArrayTy->getNumElements());
uint64_t NumElems = 1;
Type *ElemTy;
do {
NumElems *= ArrayTy->getNumElements();
ElemTy = ArrayTy->getElementType();
} while ((ArrayTy = dyn_cast<ArrayType>(ElemTy)));

// Check for array of vectors
auto *InnerVectorTy = dyn_cast<FixedVectorType>(ElemTy);
if (InnerVectorTy) {
NumElems *= InnerVectorTy->getNumElements();
ElemTy = InnerVectorTy->getElementType();
}

if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8;
unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
// Expand vector if required to match padding of inner type,
// i.e. odd size subvectors.
// Storage size of new vector must match that of alloca for correct
// behaviour of byte offsets and GEP computation.
if (NumElems * ElementSize != AllocaSize)
NumElems = AllocaSize / ElementSize;
if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
VectorTy = FixedVectorType::get(ElemTy, NumElems);
}
}

// FIXME: There is no reason why we can't support larger arrays, we
// are just being conservative for now.
// FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or
// equivalent. Potentially these could also be promoted but we don't currently
// handle this case
if (!VectorTy) {
LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
return false;
}

if (VectorTy->getNumElements() > 16 || VectorTy->getNumElements() < 2) {
const unsigned MaxElements =
(MaxVectorRegs * 32) / DL->getTypeSizeInBits(VectorTy->getElementType());

if (VectorTy->getNumElements() > MaxElements ||
VectorTy->getNumElements() < 2) {
LLVM_DEBUG(dbgs() << " " << *VectorTy
<< " has an unsupported number of elements\n");
return false;
@@ -761,11 +827,14 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
SmallVector<Instruction *> WorkList;
SmallVector<Instruction *> UsersToRemove;
SmallVector<Instruction *> DeferredInsts;
SmallVector<Instruction *> NewGEPInsts;
DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;

const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n"
<< " " << *Inst << "\n");
for (auto *Inst : reverse(NewGEPInsts))
Inst->eraseFromParent();
return false;
};

@@ -815,7 +884,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
// If we can't compute a vector index from this GEP, then we can't
// promote this alloca to vector.
Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL);
Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts);
if (!Index)
return RejectUser(Inst, "cannot compute vector index for GEP");

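To illustrate the array handling added to tryPromoteAllocaToVector above (a sketch with made-up allocas): nested arrays, and arrays of vectors, are now flattened to a single fixed vector before the usual promotion checks run.

define amdgpu_kernel void @flattened_examples() {
entry:
  %a = alloca [2 x [2 x i32]], addrspace(5)   ; treated as <4 x i32>
  %b = alloca [2 x [2 x i8]], addrspace(5)    ; treated as <4 x i8>
  %c = alloca [2 x <2 x i32>], addrspace(5)   ; treated as <4 x i32>
  ret void
}

The element-count cap is also no longer hard-coded to 16: with the default "amdgpu-promote-alloca-to-vector-max-regs" of 16, an i32 vector is still limited to (16 * 32) / 32 = 16 elements, while an i8 vector may now have up to (16 * 32) / 8 = 64.
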
9 changes: 7 additions & 2 deletions llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -258,7 +258,7 @@ entry:
; FUNC-LABEL: {{^}}no_overlap:
;
; A total of 5 bytes should be allocated and used.
; SI: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 ;
; SI-ALLOCA: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 ;
define amdgpu_kernel void @no_overlap(ptr addrspace(1) %out, i32 %in) #0 {
entry:
%0 = alloca [3 x i8], align 1, addrspace(5)
@@ -281,6 +281,7 @@ entry:
ret void
}

; FUNC-LABEL: {{^}}char_array_array:
define amdgpu_kernel void @char_array_array(ptr addrspace(1) %out, i32 %index) #0 {
entry:
%alloca = alloca [2 x [2 x i8]], addrspace(5)
@@ -294,6 +295,7 @@ entry:
ret void
}

; FUNC-LABEL: {{^}}i32_array_array:
define amdgpu_kernel void @i32_array_array(ptr addrspace(1) %out, i32 %index) #0 {
entry:
%alloca = alloca [2 x [2 x i32]], addrspace(5)
@@ -306,6 +308,7 @@ entry:
ret void
}

; FUNC-LABEL: {{^}}i64_array_array:
define amdgpu_kernel void @i64_array_array(ptr addrspace(1) %out, i32 %index) #0 {
entry:
%alloca = alloca [2 x [2 x i64]], addrspace(5)
@@ -319,7 +322,7 @@ entry:
}

%struct.pair32 = type { i32, i32 }

; FUNC-LABEL: {{^}}struct_array_array:
define amdgpu_kernel void @struct_array_array(ptr addrspace(1) %out, i32 %index) #0 {
entry:
%alloca = alloca [2 x [2 x %struct.pair32]], addrspace(5)
@@ -333,6 +336,7 @@ entry:
ret void
}

; FUNC-LABEL: {{^}}struct_pair32_array:
define amdgpu_kernel void @struct_pair32_array(ptr addrspace(1) %out, i32 %index) #0 {
entry:
%alloca = alloca [2 x %struct.pair32], addrspace(5)
@@ -346,6 +350,7 @@ entry:
ret void
}

; FUNC-LABEL: {{^}}select_private:
define amdgpu_kernel void @select_private(ptr addrspace(1) %out, i32 %in) nounwind {
entry:
%tmp = alloca [2 x i32], addrspace(5)
8 changes: 1 addition & 7 deletions llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
@@ -22,13 +22,7 @@ declare void @llvm.amdgcn.s.barrier() #2
; SI-ALLOCA: s_barrier
; SI-ALLOCA: buffer_load_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], 0 offen offset:64
;
; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this
; alloca to a vector. It currently fails because it does not know how
; to interpret:
; getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 1, i32 %b

; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 64
; SI-PROMOTE: ds_write_b32 [[PTRREG]]
; SI-PROMOTE: LDSByteSize: 0
define amdgpu_kernel void @test_private_array_ptr_calc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) #0 {
%alloca = alloca [16 x i32], align 16, addrspace(5)
%mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0);