Skip to content

[AMDGPU][PromoteAlloca] Whole-function alloca promotion to vector #84735

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Mar 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
134 changes: 108 additions & 26 deletions llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,15 @@
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/InstSimplifyFolder.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
Expand All @@ -64,10 +66,17 @@ static cl::opt<unsigned> PromoteAllocaToVectorLimit(
cl::desc("Maximum byte size to consider promote alloca to vector"),
cl::init(0));

static cl::opt<unsigned>
LoopUserWeight("promote-alloca-vector-loop-user-weight",
cl::desc("The bonus weight of users of allocas within loop "
"when sorting profitable allocas"),
cl::init(4));

// Shared implementation which can do both promotion to vector and to LDS.
class AMDGPUPromoteAllocaImpl {
private:
const TargetMachine &TM;
LoopInfo &LI;
Module *Mod = nullptr;
const DataLayout *DL = nullptr;

Expand Down Expand Up @@ -101,8 +110,11 @@ class AMDGPUPromoteAllocaImpl {
bool tryPromoteAllocaToVector(AllocaInst &I);
bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS);

void sortAllocasToPromote(SmallVectorImpl<AllocaInst *> &Allocas);

public:
AMDGPUPromoteAllocaImpl(TargetMachine &TM) : TM(TM) {
AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {

const Triple &TT = TM.getTargetTriple();
IsAMDGCN = TT.getArch() == Triple::amdgcn;
IsAMDHSA = TT.getOS() == Triple::AMDHSA;
Expand All @@ -122,7 +134,9 @@ class AMDGPUPromoteAlloca : public FunctionPass {
if (skipFunction(F))
return false;
if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>())
return AMDGPUPromoteAllocaImpl(
TPC->getTM<TargetMachine>(),
getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
.run(F, /*PromoteToLDS*/ true);
return false;
}
Expand All @@ -131,6 +145,7 @@ class AMDGPUPromoteAlloca : public FunctionPass {

void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<LoopInfoWrapperPass>();
FunctionPass::getAnalysisUsage(AU);
}
};
Expand All @@ -145,7 +160,9 @@ class AMDGPUPromoteAllocaToVector : public FunctionPass {
if (skipFunction(F))
return false;
if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>())
return AMDGPUPromoteAllocaImpl(
TPC->getTM<TargetMachine>(),
getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
.run(F, /*PromoteToLDS*/ false);
return false;
}
Expand All @@ -156,6 +173,7 @@ class AMDGPUPromoteAllocaToVector : public FunctionPass {

void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<LoopInfoWrapperPass>();
FunctionPass::getAnalysisUsage(AU);
}
};
Expand Down Expand Up @@ -186,18 +204,23 @@ INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
// Move LDS uses from functions to kernels before promote alloca for accurate
// estimation of LDS available
INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
"AMDGPU promote alloca to vector or LDS", false, false)

INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
"AMDGPU promote alloca to vector", false, false)
INITIALIZE_PASS_BEGIN(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
"AMDGPU promote alloca to vector", false, false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
"AMDGPU promote alloca to vector", false, false)

char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID;

PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
FunctionAnalysisManager &AM) {
bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F, /*PromoteToLDS*/ true);
auto &LI = AM.getResult<LoopAnalysis>(F);
bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/true);
if (Changed) {
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
Expand All @@ -208,7 +231,8 @@ PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,

PreservedAnalyses
AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F, /*PromoteToLDS*/ false);
auto &LI = AM.getResult<LoopAnalysis>(F);
bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/false);
if (Changed) {
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
Expand All @@ -225,6 +249,55 @@ FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() {
return new AMDGPUPromoteAllocaToVector();
}

// Collect every use of \p Alloca, looking through GEPs: a use of a GEP whose
// pointer operand chains back to the alloca is (transitively) a use of the
// alloca itself. All such uses, including the uses of the GEPs themselves,
// are appended to \p Uses.
static void collectAllocaUses(AllocaInst &Alloca,
                              SmallVectorImpl<Use *> &Uses) {
  SmallVector<Instruction *, 4> Pending;
  Pending.push_back(&Alloca);
  while (!Pending.empty()) {
    Instruction *Inst = Pending.pop_back_val();
    for (Use &U : Inst->uses()) {
      Uses.push_back(&U);
      // Descend into GEPs so their users are collected as well.
      if (auto *GEP = dyn_cast<GetElementPtrInst>(U.getUser()))
        Pending.push_back(GEP);
    }
  }
}

// Sort \p Allocas so the most profitable candidates come first, since they
// compete for a shared vectorization budget in run(). Profitability is a
// heuristic score: one point per (non-GEP) user, plus a LoopUserWeight bonus
// per level of loop nesting the user sits in.
// NOTE(review): the exact dbgs() output below is FileCheck'd by
// promote-alloca-scoring.ll — keep the format stable.
void AMDGPUPromoteAllocaImpl::sortAllocasToPromote(
    SmallVectorImpl<AllocaInst *> &Allocas) {
  DenseMap<AllocaInst *, unsigned> Scores;

  for (auto *Alloca : Allocas) {
    LLVM_DEBUG(dbgs() << "Scoring: " << *Alloca << "\n");
    unsigned &Score = Scores[Alloca];
    // Increment score by one for each user + a bonus for users within loops.
    // collectAllocaUses also returns uses reached through GEPs; the GEPs
    // themselves are skipped below so only real accesses are scored.
    SmallVector<Use *, 8> Uses;
    collectAllocaUses(*Alloca, Uses);
    for (auto *U : Uses) {
      Instruction *Inst = cast<Instruction>(U->getUser());
      if (isa<GetElementPtrInst>(Inst))
        continue;
      // getLoopDepth is 0 outside any loop, so a user outside loops
      // contributes exactly 1.
      unsigned UserScore =
          1 + (LoopUserWeight * LI.getLoopDepth(Inst->getParent()));
      LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *Inst << "\n");
      Score += UserScore;
    }
    LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n");
  }

  // Highest score first. stable_sort preserves the original (entry-block)
  // order for allocas with equal scores, keeping the output deterministic.
  stable_sort(Allocas, [&](AllocaInst *A, AllocaInst *B) {
    return Scores.at(A) > Scores.at(B);
  });

  // clang-format off
  LLVM_DEBUG(
    dbgs() << "Sorted Worklist:\n";
    for (auto *A: Allocas)
      dbgs() << " " << *A << "\n";
  );
  // clang-format on
}

bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
Mod = F.getParent();
DL = &Mod->getDataLayout();
Expand All @@ -237,6 +310,13 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {

bool SufficientLDS = PromoteToLDS ? hasSufficientLocalMem(F) : false;

// Use up to 1/4 of available register budget for vectorization.
// FIXME: Increase the limit for whole function budgets? Perhaps x2?
unsigned VectorizationBudget =
(PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
: (MaxVGPRs * 32)) /
4;

SmallVector<AllocaInst *, 16> Allocas;
for (Instruction &I : F.getEntryBlock()) {
if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
Expand All @@ -248,11 +328,27 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
}
}

sortAllocasToPromote(Allocas);

bool Changed = false;
for (AllocaInst *AI : Allocas) {
if (tryPromoteAllocaToVector(*AI))
const unsigned AllocaCost = DL->getTypeSizeInBits(AI->getAllocatedType());
if (AllocaCost > VectorizationBudget) {
LLVM_DEBUG(dbgs() << " Alloca too big for vectorization: " << *AI
<< "\n");
return false;
}

if (tryPromoteAllocaToVector(*AI)) {
Changed = true;
else if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
"Underflow!");
VectorizationBudget -= AllocaCost;
LLVM_DEBUG(dbgs() << " Remaining vectorization budget:"
<< VectorizationBudget << "\n");
if (VectorizationBudget == 0)
break;
} else if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
Changed = true;
}

Expand Down Expand Up @@ -641,16 +737,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
ArrayTy->getNumElements());
}

// Use up to 1/4 of available register budget for vectorization.
unsigned Limit = PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
: (MaxVGPRs * 32);

if (DL->getTypeSizeInBits(AllocaTy) * 4 > Limit) {
LLVM_DEBUG(dbgs() << " Alloca too big for vectorization with " << MaxVGPRs
<< " registers available\n");
return false;
}

// FIXME: There is no reason why we can't support larger arrays, we
// are just being conservative for now.
// FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or
Expand All @@ -671,7 +757,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
SmallVector<Instruction *> WorkList;
SmallVector<Instruction *> UsersToRemove;
SmallVector<Instruction *> DeferredInsts;
SmallVector<Use *, 8> Uses;
DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;

const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
Expand All @@ -680,15 +765,14 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
return false;
};

for (Use &U : Alloca.uses())
Uses.push_back(&U);
SmallVector<Use *, 8> Uses;
collectAllocaUses(Alloca, Uses);

LLVM_DEBUG(dbgs() << " Attempting promotion to: " << *VectorTy << "\n");

Type *VecEltTy = VectorTy->getElementType();
unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;
while (!Uses.empty()) {
Use *U = Uses.pop_back_val();
for (auto *U : Uses) {
Instruction *Inst = cast<Instruction>(U->getUser());

if (Value *Ptr = getLoadStorePointerOperand(Inst)) {
Expand Down Expand Up @@ -732,8 +816,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
return RejectUser(Inst, "cannot compute vector index for GEP");

GEPVectorIdx[GEP] = Index;
for (Use &U : Inst->uses())
Uses.push_back(&U);
UsersToRemove.push_back(Inst);
continue;
}
Expand Down
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
Original file line number Diff line number Diff line change
Expand Up @@ -195,13 +195,13 @@
; GCN-O1-NEXT: Uniformity Analysis
; GCN-O1-NEXT: AMDGPU atomic optimizations
; GCN-O1-NEXT: Expand Atomic instructions
; GCN-O1-NEXT: AMDGPU Promote Alloca
; GCN-O1-NEXT: Dominator Tree Construction
; GCN-O1-NEXT: Natural Loop Information
; GCN-O1-NEXT: AMDGPU Promote Alloca
; GCN-O1-NEXT: Cycle Info Analysis
; GCN-O1-NEXT: Uniformity Analysis
; GCN-O1-NEXT: AMDGPU IR optimizations
; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl)
; GCN-O1-NEXT: Natural Loop Information
; GCN-O1-NEXT: Canonicalize natural loops
; GCN-O1-NEXT: Scalar Evolution Analysis
; GCN-O1-NEXT: Loop Pass Manager
Expand Down Expand Up @@ -470,9 +470,9 @@
; GCN-O1-OPTS-NEXT: Uniformity Analysis
; GCN-O1-OPTS-NEXT: AMDGPU atomic optimizations
; GCN-O1-OPTS-NEXT: Expand Atomic instructions
; GCN-O1-OPTS-NEXT: AMDGPU Promote Alloca
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
; GCN-O1-OPTS-NEXT: Natural Loop Information
; GCN-O1-OPTS-NEXT: AMDGPU Promote Alloca
; GCN-O1-OPTS-NEXT: Canonicalize natural loops
; GCN-O1-OPTS-NEXT: Lazy Branch Probability Analysis
; GCN-O1-OPTS-NEXT: Lazy Block Frequency Analysis
Expand Down Expand Up @@ -775,9 +775,9 @@
; GCN-O2-NEXT: Uniformity Analysis
; GCN-O2-NEXT: AMDGPU atomic optimizations
; GCN-O2-NEXT: Expand Atomic instructions
; GCN-O2-NEXT: AMDGPU Promote Alloca
; GCN-O2-NEXT: Dominator Tree Construction
; GCN-O2-NEXT: Natural Loop Information
; GCN-O2-NEXT: AMDGPU Promote Alloca
; GCN-O2-NEXT: Split GEPs to a variadic base and a constant offset for better CSE
; GCN-O2-NEXT: Scalar Evolution Analysis
; GCN-O2-NEXT: Straight line strength reduction
Expand Down Expand Up @@ -1084,9 +1084,9 @@
; GCN-O3-NEXT: Uniformity Analysis
; GCN-O3-NEXT: AMDGPU atomic optimizations
; GCN-O3-NEXT: Expand Atomic instructions
; GCN-O3-NEXT: AMDGPU Promote Alloca
; GCN-O3-NEXT: Dominator Tree Construction
; GCN-O3-NEXT: Natural Loop Information
; GCN-O3-NEXT: AMDGPU Promote Alloca
; GCN-O3-NEXT: Split GEPs to a variadic base and a constant offset for better CSE
; GCN-O3-NEXT: Scalar Evolution Analysis
; GCN-O3-NEXT: Straight line strength reduction
Expand Down
69 changes: 69 additions & 0 deletions llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -debug-only=amdgpu-promote-alloca -amdgpu-promote-alloca-to-vector-limit=512 -passes=amdgpu-promote-alloca %s -o - 2>&1 | FileCheck %s
; REQUIRES: asserts

; CHECK: Scoring: %simpleuser = alloca [4 x i64], align 4, addrspace(5)
; CHECK-NEXT: [+1]: store i32 42, ptr addrspace(5) %simpleuser, align 4
; CHECK-NEXT: => Final Score:1
; CHECK-NEXT: Scoring: %manyusers = alloca [4 x i64], align 4, addrspace(5)
; CHECK-NEXT: [+1]: store i32 %v0.ext, ptr addrspace(5) %manyusers.1, align 4
; CHECK-NEXT: [+1]: %v0 = load i8, ptr addrspace(5) %manyusers.1, align 1
; CHECK-NEXT: [+1]: store i32 %v1.ext, ptr addrspace(5) %manyusers.2, align 4
; CHECK-NEXT: [+1]: %v1 = load i8, ptr addrspace(5) %manyusers.2, align 1
; CHECK-NEXT: => Final Score:4
; CHECK-NEXT: Sorted Worklist:
; CHECK-NEXT: %manyusers = alloca [4 x i64], align 4, addrspace(5)
; CHECK-NEXT: %simpleuser = alloca [4 x i64], align 4, addrspace(5)
; Scoring counts one point per non-GEP user of each alloca (no loops here,
; so no loop-depth bonus); the GEPs themselves are not scored.
define amdgpu_kernel void @simple_users_scores() #0 {
entry:
  ; one user (the store) => should get a score of 1
  %simpleuser = alloca [4 x i64], align 4, addrspace(5)
  ; four users (2 loads + 2 stores, reached through GEPs) => score of 4
  %manyusers = alloca [4 x i64], align 4, addrspace(5)

  store i32 42, ptr addrspace(5) %simpleuser

  %manyusers.1 = getelementptr i8, ptr addrspace(5) %manyusers, i64 2
  %v0 = load i8, ptr addrspace(5) %manyusers.1
  %v0.ext = zext i8 %v0 to i32
  store i32 %v0.ext, ptr addrspace(5) %manyusers.1

  %manyusers.2 = getelementptr i8, ptr addrspace(5) %manyusers, i64 1
  %v1 = load i8, ptr addrspace(5) %manyusers.2
  ; Fixed copy-paste typo: extend %v1 (was %v0), so %v1 is no longer a dead
  ; load. Scoring is unaffected; the zext is not a user of the alloca.
  %v1.ext = zext i8 %v1 to i32
  store i32 %v1.ext, ptr addrspace(5) %manyusers.2

  ret void
}

; CHECK: Scoring: %stack = alloca [4 x i64], align 4, addrspace(5)
; CHECK-NEXT: [+5]: store i32 32, ptr addrspace(5) %stack, align 4
; CHECK-NEXT: [+1]: store i32 42, ptr addrspace(5) %stack, align 4
; CHECK-NEXT: [+9]: store i32 32, ptr addrspace(5) %stack.1, align 4
; CHECK-NEXT: [+5]: %outer.cmp = load i1, ptr addrspace(5) %stack.1, align 1
; CHECK-NEXT: [+1]: store i32 64, ptr addrspace(5) %stack.2, align 4
; CHECK-NEXT: [+9]: %inner.cmp = load i1, ptr addrspace(5) %stack.2, align 1
; CHECK-NEXT: => Final Score:30
; Users inside loops get a bonus of (default weight 4) x loop depth, so each
; user scores 1 (depth 0), 5 (outer loop) or 9 (inner loop).
define amdgpu_kernel void @loop_users_alloca(i1 %x, i2) #0 {
entry:
  ; should get a score of 30:
  ;   1 + 1 (entry/exit stores) + 5 + 5 (outer loop) + 9 + 9 (inner loop)
  %stack = alloca [4 x i64], align 4, addrspace(5)
  %stack.1 = getelementptr i8, ptr addrspace(5) %stack, i64 4
  %stack.2 = getelementptr i8, ptr addrspace(5) %stack, i64 8

  store i32 42, ptr addrspace(5) %stack
  br label %loop.outer

loop.outer:
  ; depth 1 => each user scores 1 + 4*1 = 5
  store i32 32, ptr addrspace(5) %stack
  %outer.cmp = load i1, ptr addrspace(5) %stack.1
  br label %loop.inner

loop.inner:
  ; depth 2 => each user scores 1 + 4*2 = 9
  store i32 32, ptr addrspace(5) %stack.1
  %inner.cmp = load i1, ptr addrspace(5) %stack.2
  br i1 %inner.cmp, label %loop.inner, label %loop.outer

exit:
  ; NOTE(review): this block has no predecessors; its store is still
  ; collected as a user and scored at depth 0 ([+1] in the checked output).
  store i32 64, ptr addrspace(5) %stack.2
  ret void
}