Skip to content

Commit 77d30c3

Browse files
committed
Address reviewer comments:
- Move options into pass attributes
- Change vector size limit from max elements to max 32b registers
- Add tests for i16, float and ptr in multi-dimensional arrays
1 parent 9293d1a commit 77d30c3

File tree

7 files changed

+134
-27
lines changed

7 files changed

+134
-27
lines changed

llvm/docs/AMDGPUUsage.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1720,7 +1720,7 @@ The AMDGPU backend supports the following LLVM IR attributes.
17201720
"amdgpu-sgpr-hazard-mem-wait-cull-threshold" Sets the number of active SGPR hazards that must be present before
17211721
inserting a cull sequence at a memory wait.
17221722

1723-
"amdgpu-promote-alloca-to-vector-max-elements" Maximum vector size (in elements) to create when promoting alloca.
1723+
"amdgpu-promote-alloca-to-vector-max-regs" Maximum vector size (in 32b registers) to create when promoting alloca.
17241724

17251725
"amdgpu-promote-alloca-to-vector-vgpr-ratio" Ratio of VGPRs to budget for promoting alloca to vectors.
17261726

llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

Lines changed: 24 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,10 @@ static cl::opt<unsigned> PromoteAllocaToVectorLimit(
6666
cl::desc("Maximum byte size to consider promote alloca to vector"),
6767
cl::init(0));
6868

69-
static cl::opt<unsigned> PromoteAllocaToVectorMaxElements(
70-
"amdgpu-promote-alloca-to-vector-max-elements",
71-
cl::desc("Maximum vector size (in elements) to use when promoting alloca"),
69+
static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
70+
"amdgpu-promote-alloca-to-vector-max-regs",
71+
cl::desc(
72+
"Maximum vector size (in 32b registers) to use when promoting alloca"),
7273
cl::init(16));
7374

7475
// Use up to 1/4 of available register budget for vectorization.
@@ -96,6 +97,8 @@ class AMDGPUPromoteAllocaImpl {
9697
uint32_t LocalMemLimit = 0;
9798
uint32_t CurrentLocalMemUsage = 0;
9899
unsigned MaxVGPRs;
100+
unsigned VGPRBudgetRatio;
101+
unsigned MaxVectorRegs;
99102

100103
bool IsAMDGCN = false;
101104
bool IsAMDHSA = false;
@@ -124,6 +127,8 @@ class AMDGPUPromoteAllocaImpl {
124127

125128
void sortAllocasToPromote(SmallVectorImpl<AllocaInst *> &Allocas);
126129

130+
void setFunctionLimits(const Function &F);
131+
127132
public:
128133
AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {
129134

@@ -310,6 +315,19 @@ void AMDGPUPromoteAllocaImpl::sortAllocasToPromote(
310315
// clang-format on
311316
}
312317

318+
void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
319+
// Load per function limits, overriding with global options where appropriate.
320+
MaxVectorRegs = F.getFnAttributeAsParsedInteger(
321+
"amdgpu-promote-alloca-to-vector-max-regs", PromoteAllocaToVectorMaxRegs);
322+
if (PromoteAllocaToVectorMaxRegs.getNumOccurrences())
323+
MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
324+
VGPRBudgetRatio = F.getFnAttributeAsParsedInteger(
325+
"amdgpu-promote-alloca-to-vector-vgpr-ratio",
326+
PromoteAllocaToVectorVGPRRatio);
327+
if (PromoteAllocaToVectorVGPRRatio.getNumOccurrences())
328+
VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
329+
}
330+
313331
bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
314332
Mod = F.getParent();
315333
DL = &Mod->getDataLayout();
@@ -319,20 +337,14 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
319337
return false;
320338

321339
MaxVGPRs = getMaxVGPRs(TM, F);
340+
setFunctionLimits(F);
322341

323342
bool SufficientLDS = PromoteToLDS ? hasSufficientLocalMem(F) : false;
324343

325-
const unsigned VGPRRatio =
326-
PromoteAllocaToVectorVGPRRatio.getNumOccurrences()
327-
? PromoteAllocaToVectorVGPRRatio
328-
: F.getFnAttributeAsParsedInteger(
329-
"amdgpu-promote-alloca-to-vector-vgpr-ratio",
330-
PromoteAllocaToVectorVGPRRatio);
331-
332344
unsigned VectorizationBudget =
333345
(PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
334346
: (MaxVGPRs * 32)) /
335-
VGPRRatio;
347+
VGPRBudgetRatio;
336348

337349
SmallVector<AllocaInst *, 16> Allocas;
338350
for (Instruction &I : F.getEntryBlock()) {
@@ -802,11 +814,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
802814
}
803815

804816
const unsigned MaxElements =
805-
PromoteAllocaToVectorMaxElements.getNumOccurrences()
806-
? PromoteAllocaToVectorMaxElements
807-
: Alloca.getParent()->getParent()->getFnAttributeAsParsedInteger(
808-
"amdgpu-promote-alloca-to-vector-max-elements",
809-
PromoteAllocaToVectorMaxElements);
817+
(MaxVectorRegs * 32) / DL->getTypeSizeInBits(VectorTy->getElementType());
810818

811819
if (VectorTy->getNumElements() > MaxElements ||
812820
VectorTy->getNumElements() < 2) {

llvm/test/CodeGen/AMDGPU/promote-alloca-max-elements.ll renamed to llvm/test/CodeGen/AMDGPU/promote-alloca-max-regs.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
22
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 < %s | FileCheck --check-prefix=BASE --check-prefix=DEFAULT %s
3-
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-max-elements=24 < %s | FileCheck --check-prefix=BASE %s --check-prefix=MAX24
4-
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-max-elements=32 < %s | FileCheck --check-prefix=BASE %s --check-prefix=MAX32
3+
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-max-regs=24 < %s | FileCheck --check-prefix=BASE %s --check-prefix=MAX24
4+
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-max-regs=32 < %s | FileCheck --check-prefix=BASE %s --check-prefix=MAX32
55

66
define amdgpu_kernel void @i32_24_elements(ptr %out) #0 {
77
; DEFAULT-LABEL: define amdgpu_kernel void @i32_24_elements(
@@ -232,5 +232,5 @@ declare i32 @llvm.amdgcn.workitem.id.y()
232232
declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture writeonly, i8, i32, i1 immarg)
233233

234234
attributes #0 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" }
235-
attributes #1 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" "amdgpu-promote-alloca-to-vector-max-elements"="24" }
236-
attributes #2 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" "amdgpu-promote-alloca-to-vector-max-elements"="32" }
235+
attributes #1 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" "amdgpu-promote-alloca-to-vector-max-regs"="24" }
236+
attributes #2 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" "amdgpu-promote-alloca-to-vector-max-regs"="32" }

llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,5 +288,104 @@ define amdgpu_kernel void @i32_3d_load_store(ptr %out) {
288288
ret void
289289
}
290290

291+
define amdgpu_kernel void @i16_2d_load_store(ptr %out, i32 %sel) {
292+
; CHECK-LABEL: define amdgpu_kernel void @i16_2d_load_store(
293+
; CHECK-SAME: ptr [[OUT:%.*]], i32 [[SEL:%.*]]) {
294+
; CHECK-NEXT: [[TMP1:%.*]] = add i32 3, [[SEL]]
295+
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <6 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5>, i32 [[TMP1]]
296+
; CHECK-NEXT: store i16 [[TMP2]], ptr [[OUT]], align 2
297+
; CHECK-NEXT: ret void
298+
;
299+
%alloca = alloca [2 x [3 x i16]], align 16, addrspace(5)
300+
%gep.00 = getelementptr inbounds [2 x [3 x i16]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0
301+
%gep.01 = getelementptr inbounds [2 x [3 x i16]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1
302+
%gep.02 = getelementptr inbounds [2 x [3 x i16]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 2
303+
%gep.10 = getelementptr inbounds [2 x [3 x i16]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
304+
%gep.11 = getelementptr inbounds [2 x [3 x i16]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1
305+
%gep.12 = getelementptr inbounds [2 x [3 x i16]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 2
306+
store i16 0, ptr addrspace(5) %gep.00
307+
store i16 1, ptr addrspace(5) %gep.01
308+
store i16 2, ptr addrspace(5) %gep.02
309+
store i16 3, ptr addrspace(5) %gep.10
310+
store i16 4, ptr addrspace(5) %gep.11
311+
store i16 5, ptr addrspace(5) %gep.12
312+
%gep = getelementptr inbounds [2 x [3 x i16]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 %sel
313+
%load = load i16, ptr addrspace(5) %gep
314+
store i16 %load, ptr %out
315+
ret void
316+
}
317+
318+
define amdgpu_kernel void @float_2d_load_store(ptr %out, i32 %sel) {
319+
; CHECK-LABEL: define amdgpu_kernel void @float_2d_load_store(
320+
; CHECK-SAME: ptr [[OUT:%.*]], i32 [[SEL:%.*]]) {
321+
; CHECK-NEXT: [[TMP1:%.*]] = add i32 3, [[SEL]]
322+
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <6 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00>, i32 [[TMP1]]
323+
; CHECK-NEXT: store float [[TMP2]], ptr [[OUT]], align 4
324+
; CHECK-NEXT: ret void
325+
;
326+
%alloca = alloca [2 x [3 x float]], align 16, addrspace(5)
327+
%gep.00 = getelementptr inbounds [2 x [3 x float]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0
328+
%gep.01 = getelementptr inbounds [2 x [3 x float]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1
329+
%gep.02 = getelementptr inbounds [2 x [3 x float]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 2
330+
%gep.10 = getelementptr inbounds [2 x [3 x float]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
331+
%gep.11 = getelementptr inbounds [2 x [3 x float]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1
332+
%gep.12 = getelementptr inbounds [2 x [3 x float]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 2
333+
store float 0.0, ptr addrspace(5) %gep.00
334+
store float 1.0, ptr addrspace(5) %gep.01
335+
store float 2.0, ptr addrspace(5) %gep.02
336+
store float 3.0, ptr addrspace(5) %gep.10
337+
store float 4.0, ptr addrspace(5) %gep.11
338+
store float 5.0, ptr addrspace(5) %gep.12
339+
%gep = getelementptr inbounds [2 x [3 x float]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 %sel
340+
%load = load float, ptr addrspace(5) %gep
341+
store float %load, ptr %out
342+
ret void
343+
}
344+
345+
define amdgpu_kernel void @ptr_2d_load_store(ptr %out, i32 %sel) {
346+
; CHECK-LABEL: define amdgpu_kernel void @ptr_2d_load_store(
347+
; CHECK-SAME: ptr [[OUT:%.*]], i32 [[SEL:%.*]]) {
348+
; CHECK-NEXT: [[PTR_0:%.*]] = getelementptr inbounds ptr, ptr [[OUT]], i32 0
349+
; CHECK-NEXT: [[PTR_1:%.*]] = getelementptr inbounds ptr, ptr [[OUT]], i32 1
350+
; CHECK-NEXT: [[PTR_2:%.*]] = getelementptr inbounds ptr, ptr [[OUT]], i32 2
351+
; CHECK-NEXT: [[PTR_3:%.*]] = getelementptr inbounds ptr, ptr [[OUT]], i32 3
352+
; CHECK-NEXT: [[PTR_4:%.*]] = getelementptr inbounds ptr, ptr [[OUT]], i32 4
353+
; CHECK-NEXT: [[PTR_5:%.*]] = getelementptr inbounds ptr, ptr [[OUT]], i32 5
354+
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <6 x ptr> undef, ptr [[PTR_0]], i32 0
355+
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <6 x ptr> [[TMP1]], ptr [[PTR_1]], i32 1
356+
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <6 x ptr> [[TMP2]], ptr [[PTR_2]], i32 2
357+
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <6 x ptr> [[TMP3]], ptr [[PTR_3]], i32 3
358+
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <6 x ptr> [[TMP4]], ptr [[PTR_4]], i32 4
359+
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <6 x ptr> [[TMP5]], ptr [[PTR_5]], i32 5
360+
; CHECK-NEXT: [[TMP7:%.*]] = add i32 3, [[SEL]]
361+
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <6 x ptr> [[TMP6]], i32 [[TMP7]]
362+
; CHECK-NEXT: store ptr [[TMP8]], ptr [[OUT]], align 8
363+
; CHECK-NEXT: ret void
364+
;
365+
%alloca = alloca [2 x [3 x ptr]], align 16, addrspace(5)
366+
%gep.00 = getelementptr inbounds [2 x [3 x ptr]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0
367+
%gep.01 = getelementptr inbounds [2 x [3 x ptr]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1
368+
%gep.02 = getelementptr inbounds [2 x [3 x ptr]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 2
369+
%gep.10 = getelementptr inbounds [2 x [3 x ptr]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
370+
%gep.11 = getelementptr inbounds [2 x [3 x ptr]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1
371+
%gep.12 = getelementptr inbounds [2 x [3 x ptr]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 2
372+
%ptr.0 = getelementptr inbounds ptr, ptr %out, i32 0
373+
%ptr.1 = getelementptr inbounds ptr, ptr %out, i32 1
374+
%ptr.2 = getelementptr inbounds ptr, ptr %out, i32 2
375+
%ptr.3 = getelementptr inbounds ptr, ptr %out, i32 3
376+
%ptr.4 = getelementptr inbounds ptr, ptr %out, i32 4
377+
%ptr.5 = getelementptr inbounds ptr, ptr %out, i32 5
378+
store ptr %ptr.0, ptr addrspace(5) %gep.00
379+
store ptr %ptr.1, ptr addrspace(5) %gep.01
380+
store ptr %ptr.2, ptr addrspace(5) %gep.02
381+
store ptr %ptr.3, ptr addrspace(5) %gep.10
382+
store ptr %ptr.4, ptr addrspace(5) %gep.11
383+
store ptr %ptr.5, ptr addrspace(5) %gep.12
384+
%gep = getelementptr inbounds [2 x [3 x ptr]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 %sel
385+
%load = load ptr, ptr addrspace(5) %gep
386+
store ptr %load, ptr %out
387+
ret void
388+
}
389+
291390
declare i32 @llvm.amdgcn.workitem.id.x()
292391
declare i32 @llvm.amdgcn.workitem.id.y()

llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
2-
; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca -amdgpu-promote-alloca-to-vector-limit=512 < %s | FileCheck %s
2+
; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca -amdgpu-promote-alloca-to-vector-limit=512 -amdgpu-promote-alloca-to-vector-max-regs=32 < %s | FileCheck %s
33

44
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
55

llvm/test/CodeGen/AMDGPU/promote-alloca-vgpr-ratio.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -269,8 +269,8 @@ declare i32 @llvm.amdgcn.workitem.id.x()
269269
declare i32 @llvm.amdgcn.workitem.id.y()
270270
declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture writeonly, i8, i32, i1 immarg)
271271

272-
attributes #0 = { nounwind "amdgpu-promote-alloca-to-vector-max-elements"="24" "amdgpu-waves-per-eu"="4,4" }
273-
attributes #1 = { nounwind "amdgpu-promote-alloca-to-vector-max-elements"="24" "amdgpu-waves-per-eu"="4,4" "amdgpu-promote-alloca-to-vector-vgpr-ratio"="2" }
274-
attributes #2 = { nounwind "amdgpu-promote-alloca-to-vector-max-elements"="24" "amdgpu-waves-per-eu"="4,4" "amdgpu-promote-alloca-to-vector-vgpr-ratio"="8" }
272+
attributes #0 = { nounwind "amdgpu-promote-alloca-to-vector-max-regs"="24" "amdgpu-waves-per-eu"="4,4" }
273+
attributes #1 = { nounwind "amdgpu-promote-alloca-to-vector-max-regs"="24" "amdgpu-waves-per-eu"="4,4" "amdgpu-promote-alloca-to-vector-vgpr-ratio"="2" }
274+
attributes #2 = { nounwind "amdgpu-promote-alloca-to-vector-max-regs"="24" "amdgpu-waves-per-eu"="4,4" "amdgpu-promote-alloca-to-vector-vgpr-ratio"="8" }
275275
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
276276
; BASE: {{.*}}

llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' < %s | FileCheck -check-prefix=OPT %s
2-
; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' -amdgpu-promote-alloca-to-vector-limit=32 < %s | FileCheck -check-prefix=LIMIT32 %s
1+
; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' -amdgpu-promote-alloca-to-vector-max-regs=64 < %s | FileCheck -check-prefix=OPT %s
2+
; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' -amdgpu-promote-alloca-to-vector-limit=32 -amdgpu-promote-alloca-to-vector-max-regs=64 < %s | FileCheck -check-prefix=LIMIT32 %s
33

44
target datalayout = "A5"
55

0 commit comments

Comments
 (0)