Address reviewer comments:

perlfu · perlfu · commit 77d30c35e4df · 2025-02-21T19:22:20.000+09:00
- Move options into pass attributes
- Change the vector size limit from a maximum element count to a maximum number of 32-bit registers
- Add tests for i16, float and ptr in multi-dimensional arrays
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
@@ -1720,7 +1720,7 @@ The AMDGPU backend supports the following LLVM IR attributes.
      "amdgpu-sgpr-hazard-mem-wait-cull-threshold"     Sets the number of active SGPR hazards that must be present before
                                                       inserting a cull sequence at a memory wait.
 
-     "amdgpu-promote-alloca-to-vector-max-elements"   Maximum vector size (in elements) to create when promoting alloca.
+     "amdgpu-promote-alloca-to-vector-max-regs"       Maximum vector size (in 32-bit registers) to create when promoting alloca.
 
      "amdgpu-promote-alloca-to-vector-vgpr-ratio"     Ratio of VGPRs to budget for promoting alloca to vectors.
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -66,9 +66,10 @@ static cl::opt<unsigned> PromoteAllocaToVectorLimit(
     cl::desc("Maximum byte size to consider promote alloca to vector"),
     cl::init(0));
 
-static cl::opt<unsigned> PromoteAllocaToVectorMaxElements(
-    "amdgpu-promote-alloca-to-vector-max-elements",
-    cl::desc("Maximum vector size (in elements) to use when promoting alloca"),
+static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
+    "amdgpu-promote-alloca-to-vector-max-regs",
+    cl::desc(
+        "Maximum vector size (in 32b registers) to use when promoting alloca"),
     cl::init(16));
 
 // Use up to 1/4 of available register budget for vectorization.
@@ -96,6 +97,8 @@ class AMDGPUPromoteAllocaImpl {
   uint32_t LocalMemLimit = 0;
   uint32_t CurrentLocalMemUsage = 0;
   unsigned MaxVGPRs;
+  unsigned VGPRBudgetRatio;
+  unsigned MaxVectorRegs;
 
   bool IsAMDGCN = false;
   bool IsAMDHSA = false;
@@ -124,6 +127,8 @@ class AMDGPUPromoteAllocaImpl {
 
   void sortAllocasToPromote(SmallVectorImpl<AllocaInst *> &Allocas);
 
+  void setFunctionLimits(const Function &F);
+
 public:
   AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {
 
@@ -310,6 +315,19 @@ void AMDGPUPromoteAllocaImpl::sortAllocasToPromote(
   // clang-format on
 }
 
+void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
+  // Read per-function limits; explicit command-line options take priority.
+  MaxVectorRegs = F.getFnAttributeAsParsedInteger(
+      "amdgpu-promote-alloca-to-vector-max-regs", PromoteAllocaToVectorMaxRegs);
+  if (PromoteAllocaToVectorMaxRegs.getNumOccurrences())
+    MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
+  VGPRBudgetRatio = F.getFnAttributeAsParsedInteger(
+      "amdgpu-promote-alloca-to-vector-vgpr-ratio",
+      PromoteAllocaToVectorVGPRRatio);
+  if (PromoteAllocaToVectorVGPRRatio.getNumOccurrences())
+    VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
+}
+
 bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
   Mod = F.getParent();
   DL = &Mod->getDataLayout();
@@ -319,20 +337,14 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
     return false;
 
   MaxVGPRs = getMaxVGPRs(TM, F);
+  setFunctionLimits(F);
 
   bool SufficientLDS = PromoteToLDS ? hasSufficientLocalMem(F) : false;
 
-  const unsigned VGPRRatio =
-      PromoteAllocaToVectorVGPRRatio.getNumOccurrences()
-          ? PromoteAllocaToVectorVGPRRatio
-          : F.getFnAttributeAsParsedInteger(
-                "amdgpu-promote-alloca-to-vector-vgpr-ratio",
-                PromoteAllocaToVectorVGPRRatio);
-
   unsigned VectorizationBudget =
       (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
                                   : (MaxVGPRs * 32)) /
-      VGPRRatio;
+      VGPRBudgetRatio;
 
   SmallVector<AllocaInst *, 16> Allocas;
   for (Instruction &I : F.getEntryBlock()) {
@@ -802,11 +814,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   }
 
   const unsigned MaxElements =
-      PromoteAllocaToVectorMaxElements.getNumOccurrences()
-          ? PromoteAllocaToVectorMaxElements
-          : Alloca.getParent()->getParent()->getFnAttributeAsParsedInteger(
-                "amdgpu-promote-alloca-to-vector-max-elements",
-                PromoteAllocaToVectorMaxElements);
+      (MaxVectorRegs * 32) / DL->getTypeSizeInBits(VectorTy->getElementType());
 
   if (VectorTy->getNumElements() > MaxElements ||
       VectorTy->getNumElements() < 2) {
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-max-regs.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-max-regs.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 < %s | FileCheck --check-prefix=BASE --check-prefix=DEFAULT %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-max-elements=24 < %s | FileCheck --check-prefix=BASE %s --check-prefix=MAX24
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-max-elements=32 < %s | FileCheck --check-prefix=BASE %s --check-prefix=MAX32
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-max-regs=24 < %s | FileCheck --check-prefix=BASE %s --check-prefix=MAX24
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-max-regs=32 < %s | FileCheck --check-prefix=BASE %s --check-prefix=MAX32
 
 define amdgpu_kernel void @i32_24_elements(ptr %out) #0 {
 ; DEFAULT-LABEL: define amdgpu_kernel void @i32_24_elements(
@@ -232,5 +232,5 @@ declare i32 @llvm.amdgcn.workitem.id.y()
 declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture writeonly, i8, i32, i1 immarg)
 
 attributes #0 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" }
-attributes #1 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" "amdgpu-promote-alloca-to-vector-max-elements"="24" }
-attributes #2 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" "amdgpu-promote-alloca-to-vector-max-elements"="32" }
+attributes #1 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" "amdgpu-promote-alloca-to-vector-max-regs"="24" }
+attributes #2 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" "amdgpu-promote-alloca-to-vector-max-regs"="32" }
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll
@@ -288,5 +288,104 @@ define amdgpu_kernel void @i32_3d_load_store(ptr %out) {
   ret void
 }
 
+define amdgpu_kernel void @i16_2d_load_store(ptr %out, i32 %sel) {
+; CHECK-LABEL: define amdgpu_kernel void @i16_2d_load_store(
+; CHECK-SAME: ptr [[OUT:%.*]], i32 [[SEL:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 3, [[SEL]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <6 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5>, i32 [[TMP1]]
+; CHECK-NEXT:    store i16 [[TMP2]], ptr [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+  %alloca = alloca [2 x [3 x i16]], align 16, addrspace(5)
+  %gep.00 = getelementptr inbounds [2 x [3 x i16]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0
+  %gep.01 = getelementptr inbounds [2 x [3 x i16]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1
+  %gep.02 = getelementptr inbounds [2 x [3 x i16]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 2
+  %gep.10 = getelementptr inbounds [2 x [3 x i16]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
+  %gep.11 = getelementptr inbounds [2 x [3 x i16]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1
+  %gep.12 = getelementptr inbounds [2 x [3 x i16]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 2
+  store i16 0, ptr addrspace(5) %gep.00
+  store i16 1, ptr addrspace(5) %gep.01
+  store i16 2, ptr addrspace(5) %gep.02
+  store i16 3, ptr addrspace(5) %gep.10
+  store i16 4, ptr addrspace(5) %gep.11
+  store i16 5, ptr addrspace(5) %gep.12
+  %gep = getelementptr inbounds [2 x [3 x i16]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 %sel
+  %load = load i16, ptr addrspace(5) %gep
+  store i16 %load, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @float_2d_load_store(ptr %out, i32 %sel) {
+; CHECK-LABEL: define amdgpu_kernel void @float_2d_load_store(
+; CHECK-SAME: ptr [[OUT:%.*]], i32 [[SEL:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 3, [[SEL]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <6 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00>, i32 [[TMP1]]
+; CHECK-NEXT:    store float [[TMP2]], ptr [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+  %alloca = alloca [2 x [3 x float]], align 16, addrspace(5)
+  %gep.00 = getelementptr inbounds [2 x [3 x float]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0
+  %gep.01 = getelementptr inbounds [2 x [3 x float]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1
+  %gep.02 = getelementptr inbounds [2 x [3 x float]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 2
+  %gep.10 = getelementptr inbounds [2 x [3 x float]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
+  %gep.11 = getelementptr inbounds [2 x [3 x float]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1
+  %gep.12 = getelementptr inbounds [2 x [3 x float]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 2
+  store float 0.0, ptr addrspace(5) %gep.00
+  store float 1.0, ptr addrspace(5) %gep.01
+  store float 2.0, ptr addrspace(5) %gep.02
+  store float 3.0, ptr addrspace(5) %gep.10
+  store float 4.0, ptr addrspace(5) %gep.11
+  store float 5.0, ptr addrspace(5) %gep.12
+  %gep = getelementptr inbounds [2 x [3 x float]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 %sel
+  %load = load float, ptr addrspace(5) %gep
+  store float %load, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @ptr_2d_load_store(ptr %out, i32 %sel) {
+; CHECK-LABEL: define amdgpu_kernel void @ptr_2d_load_store(
+; CHECK-SAME: ptr [[OUT:%.*]], i32 [[SEL:%.*]]) {
+; CHECK-NEXT:    [[PTR_0:%.*]] = getelementptr inbounds ptr, ptr [[OUT]], i32 0
+; CHECK-NEXT:    [[PTR_1:%.*]] = getelementptr inbounds ptr, ptr [[OUT]], i32 1
+; CHECK-NEXT:    [[PTR_2:%.*]] = getelementptr inbounds ptr, ptr [[OUT]], i32 2
+; CHECK-NEXT:    [[PTR_3:%.*]] = getelementptr inbounds ptr, ptr [[OUT]], i32 3
+; CHECK-NEXT:    [[PTR_4:%.*]] = getelementptr inbounds ptr, ptr [[OUT]], i32 4
+; CHECK-NEXT:    [[PTR_5:%.*]] = getelementptr inbounds ptr, ptr [[OUT]], i32 5
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <6 x ptr> undef, ptr [[PTR_0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <6 x ptr> [[TMP1]], ptr [[PTR_1]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <6 x ptr> [[TMP2]], ptr [[PTR_2]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <6 x ptr> [[TMP3]], ptr [[PTR_3]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <6 x ptr> [[TMP4]], ptr [[PTR_4]], i32 4
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <6 x ptr> [[TMP5]], ptr [[PTR_5]], i32 5
+; CHECK-NEXT:    [[TMP7:%.*]] = add i32 3, [[SEL]]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <6 x ptr> [[TMP6]], i32 [[TMP7]]
+; CHECK-NEXT:    store ptr [[TMP8]], ptr [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+  %alloca = alloca [2 x [3 x ptr]], align 16, addrspace(5)
+  %gep.00 = getelementptr inbounds [2 x [3 x ptr]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0
+  %gep.01 = getelementptr inbounds [2 x [3 x ptr]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1
+  %gep.02 = getelementptr inbounds [2 x [3 x ptr]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 2
+  %gep.10 = getelementptr inbounds [2 x [3 x ptr]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
+  %gep.11 = getelementptr inbounds [2 x [3 x ptr]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1
+  %gep.12 = getelementptr inbounds [2 x [3 x ptr]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 2
+  %ptr.0 = getelementptr inbounds ptr, ptr %out, i32 0
+  %ptr.1 = getelementptr inbounds ptr, ptr %out, i32 1
+  %ptr.2 = getelementptr inbounds ptr, ptr %out, i32 2
+  %ptr.3 = getelementptr inbounds ptr, ptr %out, i32 3
+  %ptr.4 = getelementptr inbounds ptr, ptr %out, i32 4
+  %ptr.5 = getelementptr inbounds ptr, ptr %out, i32 5
+  store ptr %ptr.0, ptr addrspace(5) %gep.00
+  store ptr %ptr.1, ptr addrspace(5) %gep.01
+  store ptr %ptr.2, ptr addrspace(5) %gep.02
+  store ptr %ptr.3, ptr addrspace(5) %gep.10
+  store ptr %ptr.4, ptr addrspace(5) %gep.11
+  store ptr %ptr.5, ptr addrspace(5) %gep.12
+  %gep = getelementptr inbounds [2 x [3 x ptr]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 %sel
+  %load = load ptr, ptr addrspace(5) %gep
+  store ptr %load, ptr %out
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x()
 declare i32 @llvm.amdgcn.workitem.id.y()
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
-; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca -amdgpu-promote-alloca-to-vector-limit=512 < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca -amdgpu-promote-alloca-to-vector-limit=512 -amdgpu-promote-alloca-to-vector-max-regs=32 < %s | FileCheck %s
 
 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
 
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vgpr-ratio.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vgpr-ratio.ll
@@ -269,8 +269,8 @@ declare i32 @llvm.amdgcn.workitem.id.x()
 declare i32 @llvm.amdgcn.workitem.id.y()
 declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture writeonly, i8, i32, i1 immarg)
 
-attributes #0 = { nounwind "amdgpu-promote-alloca-to-vector-max-elements"="24" "amdgpu-waves-per-eu"="4,4" }
-attributes #1 = { nounwind "amdgpu-promote-alloca-to-vector-max-elements"="24" "amdgpu-waves-per-eu"="4,4" "amdgpu-promote-alloca-to-vector-vgpr-ratio"="2" }
-attributes #2 = { nounwind "amdgpu-promote-alloca-to-vector-max-elements"="24" "amdgpu-waves-per-eu"="4,4" "amdgpu-promote-alloca-to-vector-vgpr-ratio"="8" }
+attributes #0 = { nounwind "amdgpu-promote-alloca-to-vector-max-regs"="24" "amdgpu-waves-per-eu"="4,4" }
+attributes #1 = { nounwind "amdgpu-promote-alloca-to-vector-max-regs"="24" "amdgpu-waves-per-eu"="4,4" "amdgpu-promote-alloca-to-vector-vgpr-ratio"="2" }
+attributes #2 = { nounwind "amdgpu-promote-alloca-to-vector-max-regs"="24" "amdgpu-waves-per-eu"="4,4" "amdgpu-promote-alloca-to-vector-vgpr-ratio"="8" }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; BASE: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' -amdgpu-promote-alloca-to-vector-limit=32 < %s | FileCheck -check-prefix=LIMIT32 %s
+; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' -amdgpu-promote-alloca-to-vector-max-regs=64 < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' -amdgpu-promote-alloca-to-vector-limit=32 -amdgpu-promote-alloca-to-vector-max-regs=64 < %s | FileCheck -check-prefix=LIMIT32 %s
 
 target datalayout = "A5"