Skip to content

Commit 061730b

Browse files
Pierre-vh authored and chencha3 committed
[AMDGPU][PromoteAlloca] Whole-function alloca promotion to vector (llvm#84735)
Update PromoteAllocaToVector so it considers the whole function before promoting allocas. Allocas are scored & sorted so the highest value ones are seen first. The budget is now per function instead of per alloca. Passed internal performance testing.
1 parent 3c0002c commit 061730b

File tree

3 files changed

+182
-31
lines changed

3 files changed

+182
-31
lines changed

llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

Lines changed: 108 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,15 @@
3232
#include "llvm/Analysis/CaptureTracking.h"
3333
#include "llvm/Analysis/InstSimplifyFolder.h"
3434
#include "llvm/Analysis/InstructionSimplify.h"
35+
#include "llvm/Analysis/LoopInfo.h"
3536
#include "llvm/Analysis/ValueTracking.h"
3637
#include "llvm/CodeGen/TargetPassConfig.h"
3738
#include "llvm/IR/IRBuilder.h"
3839
#include "llvm/IR/IntrinsicInst.h"
3940
#include "llvm/IR/IntrinsicsAMDGPU.h"
4041
#include "llvm/IR/IntrinsicsR600.h"
4142
#include "llvm/IR/PatternMatch.h"
43+
#include "llvm/InitializePasses.h"
4244
#include "llvm/Pass.h"
4345
#include "llvm/Target/TargetMachine.h"
4446
#include "llvm/Transforms/Utils/SSAUpdater.h"
@@ -64,10 +66,17 @@ static cl::opt<unsigned> PromoteAllocaToVectorLimit(
6466
cl::desc("Maximum byte size to consider promote alloca to vector"),
6567
cl::init(0));
6668

69+
static cl::opt<unsigned>
70+
LoopUserWeight("promote-alloca-vector-loop-user-weight",
71+
cl::desc("The bonus weight of users of allocas within loop "
72+
"when sorting profitable allocas"),
73+
cl::init(4));
74+
6775
// Shared implementation which can do both promotion to vector and to LDS.
6876
class AMDGPUPromoteAllocaImpl {
6977
private:
7078
const TargetMachine &TM;
79+
LoopInfo &LI;
7180
Module *Mod = nullptr;
7281
const DataLayout *DL = nullptr;
7382

@@ -101,8 +110,11 @@ class AMDGPUPromoteAllocaImpl {
101110
bool tryPromoteAllocaToVector(AllocaInst &I);
102111
bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS);
103112

113+
void sortAllocasToPromote(SmallVectorImpl<AllocaInst *> &Allocas);
114+
104115
public:
105-
AMDGPUPromoteAllocaImpl(TargetMachine &TM) : TM(TM) {
116+
AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {
117+
106118
const Triple &TT = TM.getTargetTriple();
107119
IsAMDGCN = TT.getArch() == Triple::amdgcn;
108120
IsAMDHSA = TT.getOS() == Triple::AMDHSA;
@@ -122,7 +134,9 @@ class AMDGPUPromoteAlloca : public FunctionPass {
122134
if (skipFunction(F))
123135
return false;
124136
if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
125-
return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>())
137+
return AMDGPUPromoteAllocaImpl(
138+
TPC->getTM<TargetMachine>(),
139+
getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
126140
.run(F, /*PromoteToLDS*/ true);
127141
return false;
128142
}
@@ -131,6 +145,7 @@ class AMDGPUPromoteAlloca : public FunctionPass {
131145

132146
void getAnalysisUsage(AnalysisUsage &AU) const override {
133147
AU.setPreservesCFG();
148+
AU.addRequired<LoopInfoWrapperPass>();
134149
FunctionPass::getAnalysisUsage(AU);
135150
}
136151
};
@@ -145,7 +160,9 @@ class AMDGPUPromoteAllocaToVector : public FunctionPass {
145160
if (skipFunction(F))
146161
return false;
147162
if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
148-
return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>())
163+
return AMDGPUPromoteAllocaImpl(
164+
TPC->getTM<TargetMachine>(),
165+
getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
149166
.run(F, /*PromoteToLDS*/ false);
150167
return false;
151168
}
@@ -156,6 +173,7 @@ class AMDGPUPromoteAllocaToVector : public FunctionPass {
156173

157174
void getAnalysisUsage(AnalysisUsage &AU) const override {
158175
AU.setPreservesCFG();
176+
AU.addRequired<LoopInfoWrapperPass>();
159177
FunctionPass::getAnalysisUsage(AU);
160178
}
161179
};
@@ -186,18 +204,23 @@ INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
186204
// Move LDS uses from functions to kernels before promote alloca for accurate
187205
// estimation of LDS available
188206
INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy)
207+
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
189208
INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
190209
"AMDGPU promote alloca to vector or LDS", false, false)
191210

192-
INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
193-
"AMDGPU promote alloca to vector", false, false)
211+
INITIALIZE_PASS_BEGIN(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
212+
"AMDGPU promote alloca to vector", false, false)
213+
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
214+
INITIALIZE_PASS_END(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
215+
"AMDGPU promote alloca to vector", false, false)
194216

195217
char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
196218
char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID;
197219

198220
PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
199221
FunctionAnalysisManager &AM) {
200-
bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F, /*PromoteToLDS*/ true);
222+
auto &LI = AM.getResult<LoopAnalysis>(F);
223+
bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/true);
201224
if (Changed) {
202225
PreservedAnalyses PA;
203226
PA.preserveSet<CFGAnalyses>();
@@ -208,7 +231,8 @@ PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
208231

209232
PreservedAnalyses
210233
AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
211-
bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F, /*PromoteToLDS*/ false);
234+
auto &LI = AM.getResult<LoopAnalysis>(F);
235+
bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/false);
212236
if (Changed) {
213237
PreservedAnalyses PA;
214238
PA.preserveSet<CFGAnalyses>();
@@ -225,6 +249,55 @@ FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() {
225249
return new AMDGPUPromoteAllocaToVector();
226250
}
227251

252+
// Gather every use of \p Alloca into \p Uses, walking through
// getelementptr instructions so that users reached via a GEP are
// collected as well (the GEP's own use is recorded too).
static void collectAllocaUses(AllocaInst &Alloca,
                              SmallVectorImpl<Use *> &Uses) {
  SmallVector<Instruction *, 4> Pending;
  Pending.push_back(&Alloca);
  while (!Pending.empty()) {
    Instruction *Cur = Pending.pop_back_val();
    for (Use &U : Cur->uses()) {
      Uses.push_back(&U);
      // GEPs forward the pointer; descend into their users as well.
      if (auto *GEP = dyn_cast<GetElementPtrInst>(U.getUser()))
        Pending.push_back(GEP);
    }
  }
}
265+
266+
// Sort \p Allocas so the most profitable candidates are promoted first.
// Each alloca is scored by counting its (transitive) users; a user inside
// a loop gets an extra LoopUserWeight bonus per level of loop nesting.
// GEPs are skipped when scoring: they are address computations, not real
// users of the memory.
void AMDGPUPromoteAllocaImpl::sortAllocasToPromote(
    SmallVectorImpl<AllocaInst *> &Allocas) {
  DenseMap<AllocaInst *, unsigned> Scores;

  for (AllocaInst *AI : Allocas) {
    LLVM_DEBUG(dbgs() << "Scoring: " << *AI << "\n");

    SmallVector<Use *, 8> AllocaUses;
    collectAllocaUses(*AI, AllocaUses);

    unsigned &Score = Scores[AI];
    for (Use *U : AllocaUses) {
      auto *UserInst = cast<Instruction>(U->getUser());
      // Don't count GEPs themselves; only their users matter.
      if (isa<GetElementPtrInst>(UserInst))
        continue;
      const unsigned UserScore =
          1 + (LoopUserWeight * LI.getLoopDepth(UserInst->getParent()));
      LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *UserInst << "\n");
      Score += UserScore;
    }
    LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n");
  }

  // stable_sort keeps the original (entry-block) order for equal scores.
  stable_sort(Allocas, [&](AllocaInst *A, AllocaInst *B) {
    return Scores.at(A) > Scores.at(B);
  });

  // clang-format off
  LLVM_DEBUG(
    dbgs() << "Sorted Worklist:\n";
    for (auto *A: Allocas)
      dbgs() << " " << *A << "\n";
  );
  // clang-format on
}
300+
228301
bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
229302
Mod = F.getParent();
230303
DL = &Mod->getDataLayout();
@@ -237,6 +310,13 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
237310

238311
bool SufficientLDS = PromoteToLDS ? hasSufficientLocalMem(F) : false;
239312

313+
// Use up to 1/4 of available register budget for vectorization.
314+
// FIXME: Increase the limit for whole function budgets? Perhaps x2?
315+
unsigned VectorizationBudget =
316+
(PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
317+
: (MaxVGPRs * 32)) /
318+
4;
319+
240320
SmallVector<AllocaInst *, 16> Allocas;
241321
for (Instruction &I : F.getEntryBlock()) {
242322
if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
@@ -248,11 +328,27 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
248328
}
249329
}
250330

331+
sortAllocasToPromote(Allocas);
332+
251333
bool Changed = false;
252334
for (AllocaInst *AI : Allocas) {
253-
if (tryPromoteAllocaToVector(*AI))
335+
const unsigned AllocaCost = DL->getTypeSizeInBits(AI->getAllocatedType());
336+
if (AllocaCost > VectorizationBudget) {
337+
LLVM_DEBUG(dbgs() << " Alloca too big for vectorization: " << *AI
338+
<< "\n");
339+
return false;
340+
}
341+
342+
if (tryPromoteAllocaToVector(*AI)) {
254343
Changed = true;
255-
else if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
344+
assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
345+
"Underflow!");
346+
VectorizationBudget -= AllocaCost;
347+
LLVM_DEBUG(dbgs() << " Remaining vectorization budget:"
348+
<< VectorizationBudget << "\n");
349+
if (VectorizationBudget == 0)
350+
break;
351+
} else if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
256352
Changed = true;
257353
}
258354

@@ -641,16 +737,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
641737
ArrayTy->getNumElements());
642738
}
643739

644-
// Use up to 1/4 of available register budget for vectorization.
645-
unsigned Limit = PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
646-
: (MaxVGPRs * 32);
647-
648-
if (DL->getTypeSizeInBits(AllocaTy) * 4 > Limit) {
649-
LLVM_DEBUG(dbgs() << " Alloca too big for vectorization with " << MaxVGPRs
650-
<< " registers available\n");
651-
return false;
652-
}
653-
654740
// FIXME: There is no reason why we can't support larger arrays, we
655741
// are just being conservative for now.
656742
// FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or
@@ -671,7 +757,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
671757
SmallVector<Instruction *> WorkList;
672758
SmallVector<Instruction *> UsersToRemove;
673759
SmallVector<Instruction *> DeferredInsts;
674-
SmallVector<Use *, 8> Uses;
675760
DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;
676761

677762
const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
@@ -680,15 +765,14 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
680765
return false;
681766
};
682767

683-
for (Use &U : Alloca.uses())
684-
Uses.push_back(&U);
768+
SmallVector<Use *, 8> Uses;
769+
collectAllocaUses(Alloca, Uses);
685770

686771
LLVM_DEBUG(dbgs() << " Attempting promotion to: " << *VectorTy << "\n");
687772

688773
Type *VecEltTy = VectorTy->getElementType();
689774
unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;
690-
while (!Uses.empty()) {
691-
Use *U = Uses.pop_back_val();
775+
for (auto *U : Uses) {
692776
Instruction *Inst = cast<Instruction>(U->getUser());
693777

694778
if (Value *Ptr = getLoadStorePointerOperand(Inst)) {
@@ -732,8 +816,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
732816
return RejectUser(Inst, "cannot compute vector index for GEP");
733817

734818
GEPVectorIdx[GEP] = Index;
735-
for (Use &U : Inst->uses())
736-
Uses.push_back(&U);
737819
UsersToRemove.push_back(Inst);
738820
continue;
739821
}

llvm/test/CodeGen/AMDGPU/llc-pipeline.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -195,13 +195,13 @@
195195
; GCN-O1-NEXT: Uniformity Analysis
196196
; GCN-O1-NEXT: AMDGPU atomic optimizations
197197
; GCN-O1-NEXT: Expand Atomic instructions
198-
; GCN-O1-NEXT: AMDGPU Promote Alloca
199198
; GCN-O1-NEXT: Dominator Tree Construction
199+
; GCN-O1-NEXT: Natural Loop Information
200+
; GCN-O1-NEXT: AMDGPU Promote Alloca
200201
; GCN-O1-NEXT: Cycle Info Analysis
201202
; GCN-O1-NEXT: Uniformity Analysis
202203
; GCN-O1-NEXT: AMDGPU IR optimizations
203204
; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl)
204-
; GCN-O1-NEXT: Natural Loop Information
205205
; GCN-O1-NEXT: Canonicalize natural loops
206206
; GCN-O1-NEXT: Scalar Evolution Analysis
207207
; GCN-O1-NEXT: Loop Pass Manager
@@ -470,9 +470,9 @@
470470
; GCN-O1-OPTS-NEXT: Uniformity Analysis
471471
; GCN-O1-OPTS-NEXT: AMDGPU atomic optimizations
472472
; GCN-O1-OPTS-NEXT: Expand Atomic instructions
473-
; GCN-O1-OPTS-NEXT: AMDGPU Promote Alloca
474473
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
475474
; GCN-O1-OPTS-NEXT: Natural Loop Information
475+
; GCN-O1-OPTS-NEXT: AMDGPU Promote Alloca
476476
; GCN-O1-OPTS-NEXT: Canonicalize natural loops
477477
; GCN-O1-OPTS-NEXT: Lazy Branch Probability Analysis
478478
; GCN-O1-OPTS-NEXT: Lazy Block Frequency Analysis
@@ -775,9 +775,9 @@
775775
; GCN-O2-NEXT: Uniformity Analysis
776776
; GCN-O2-NEXT: AMDGPU atomic optimizations
777777
; GCN-O2-NEXT: Expand Atomic instructions
778-
; GCN-O2-NEXT: AMDGPU Promote Alloca
779778
; GCN-O2-NEXT: Dominator Tree Construction
780779
; GCN-O2-NEXT: Natural Loop Information
780+
; GCN-O2-NEXT: AMDGPU Promote Alloca
781781
; GCN-O2-NEXT: Split GEPs to a variadic base and a constant offset for better CSE
782782
; GCN-O2-NEXT: Scalar Evolution Analysis
783783
; GCN-O2-NEXT: Straight line strength reduction
@@ -1084,9 +1084,9 @@
10841084
; GCN-O3-NEXT: Uniformity Analysis
10851085
; GCN-O3-NEXT: AMDGPU atomic optimizations
10861086
; GCN-O3-NEXT: Expand Atomic instructions
1087-
; GCN-O3-NEXT: AMDGPU Promote Alloca
10881087
; GCN-O3-NEXT: Dominator Tree Construction
10891088
; GCN-O3-NEXT: Natural Loop Information
1089+
; GCN-O3-NEXT: AMDGPU Promote Alloca
10901090
; GCN-O3-NEXT: Split GEPs to a variadic base and a constant offset for better CSE
10911091
; GCN-O3-NEXT: Scalar Evolution Analysis
10921092
; GCN-O3-NEXT: Straight line strength reduction
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -debug-only=amdgpu-promote-alloca -amdgpu-promote-alloca-to-vector-limit=512 -passes=amdgpu-promote-alloca %s -o - 2>&1 | FileCheck %s
2+
; REQUIRES: asserts
3+
4+
; CHECK: Scoring: %simpleuser = alloca [4 x i64], align 4, addrspace(5)
5+
; CHECK-NEXT: [+1]: store i32 42, ptr addrspace(5) %simpleuser, align 4
6+
; CHECK-NEXT: => Final Score:1
7+
; CHECK-NEXT: Scoring: %manyusers = alloca [4 x i64], align 4, addrspace(5)
8+
; CHECK-NEXT: [+1]: store i32 %v0.ext, ptr addrspace(5) %manyusers.1, align 4
9+
; CHECK-NEXT: [+1]: %v0 = load i8, ptr addrspace(5) %manyusers.1, align 1
10+
; CHECK-NEXT: [+1]: store i32 %v1.ext, ptr addrspace(5) %manyusers.2, align 4
11+
; CHECK-NEXT: [+1]: %v1 = load i8, ptr addrspace(5) %manyusers.2, align 1
12+
; CHECK-NEXT: => Final Score:4
13+
; CHECK-NEXT: Sorted Worklist:
14+
; CHECK-NEXT: %manyusers = alloca [4 x i64], align 4, addrspace(5)
15+
; CHECK-NEXT: %simpleuser = alloca [4 x i64], align 4, addrspace(5)
16+
; Exercises the basic alloca scoring: every non-GEP user adds 1 to the
; score (no loops here, so the loop-user bonus never applies).
define amdgpu_kernel void @simple_users_scores() #0 {
entry:
  ; One direct user (the store below) -> score of 1.
  %simpleuser = alloca [4 x i64], align 4, addrspace(5)
  ; Four users (two loads + two stores; the GEPs are not counted) -> score of 4.
  %manyusers = alloca [4 x i64], align 4, addrspace(5)

  store i32 42, ptr addrspace(5) %simpleuser

  %manyusers.1 = getelementptr i8, ptr addrspace(5) %manyusers, i64 2
  %v0 = load i8, ptr addrspace(5) %manyusers.1
  %v0.ext = zext i8 %v0 to i32
  store i32 %v0.ext, ptr addrspace(5) %manyusers.1

  %manyusers.2 = getelementptr i8, ptr addrspace(5) %manyusers, i64 1
  %v1 = load i8, ptr addrspace(5) %manyusers.2
  ; Fixed copy/paste typo: was "zext i8 %v0", which left %v1 dead. The
  ; scoring (and the CHECK lines above) are unaffected because only the
  ; loads/stores on the alloca are counted.
  %v1.ext = zext i8 %v1 to i32
  store i32 %v1.ext, ptr addrspace(5) %manyusers.2

  ret void
}
37+
38+
; CHECK: Scoring: %stack = alloca [4 x i64], align 4, addrspace(5)
39+
; CHECK-NEXT: [+5]: store i32 32, ptr addrspace(5) %stack, align 4
40+
; CHECK-NEXT: [+1]: store i32 42, ptr addrspace(5) %stack, align 4
41+
; CHECK-NEXT: [+9]: store i32 32, ptr addrspace(5) %stack.1, align 4
42+
; CHECK-NEXT: [+5]: %outer.cmp = load i1, ptr addrspace(5) %stack.1, align 1
43+
; CHECK-NEXT: [+1]: store i32 64, ptr addrspace(5) %stack.2, align 4
44+
; CHECK-NEXT: [+9]: %inner.cmp = load i1, ptr addrspace(5) %stack.2, align 1
45+
; CHECK-NEXT: => Final Score:30
46+
; Exercises the loop-user bonus: each user scores 1 + (weight * loop depth).
; With the default weight of 4 that is +1 at depth 0, +5 in the outer loop
; and +9 in the inner loop, for the Final Score of 30 checked above.
define amdgpu_kernel void @loop_users_alloca(i1 %x, i2) #0 {
entry:
  ; Total score of 30 (see the CHECK lines above); users in deeper loops
  ; contribute more. (The old "score of 1" comment here was stale.)
  %stack = alloca [4 x i64], align 4, addrspace(5)
  %stack.1 = getelementptr i8, ptr addrspace(5) %stack, i64 4
  %stack.2 = getelementptr i8, ptr addrspace(5) %stack, i64 8

  ; Depth 0 -> +1.
  store i32 42, ptr addrspace(5) %stack
  br label %loop.outer

loop.outer:
  ; Depth 1 -> +5 per user.
  store i32 32, ptr addrspace(5) %stack
  %outer.cmp = load i1, ptr addrspace(5) %stack.1
  br label %loop.inner

loop.inner:
  ; Depth 2 -> +9 per user.
  store i32 32, ptr addrspace(5) %stack.1
  %inner.cmp = load i1, ptr addrspace(5) %stack.2
  br i1 %inner.cmp, label %loop.inner, label %loop.outer

exit:
  ; NOTE: this block has no predecessor (the inner branch only targets the
  ; two loop blocks), so it is unreachable; the scorer still visits it at
  ; depth 0 -> +1.
  store i32 64, ptr addrspace(5) %stack.2
  ret void
}

0 commit comments

Comments
 (0)