Skip to content

Commit d42c7b2

Browse files
committed
AMDGPU: Account for the size of LDS globals used through constant
expressions. Also "fix" the longstanding bug where the computed size depends on the visitation order. We could try to predict the allocation order used by legalization, but it would never be 100% perfect. Until we start fixing the addresses somehow (or have a more reliable allocation scheme later), just try to compute the size based on the worst case padding.
1 parent bbc2dde commit d42c7b2

File tree

3 files changed

+205
-35
lines changed

3 files changed

+205
-35
lines changed

llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

Lines changed: 62 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -749,41 +749,79 @@ bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
749749
if (LocalMemLimit == 0)
750750
return false;
751751

752-
const DataLayout &DL = Mod->getDataLayout();
752+
SmallVector<const Constant *, 16> Stack;
753+
SmallPtrSet<const Constant *, 8> VisitedConstants;
754+
SmallPtrSet<const GlobalVariable *, 8> UsedLDS;
755+
756+
auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
757+
for (const User *U : Val->users()) {
758+
if (const Instruction *Use = dyn_cast<Instruction>(U)) {
759+
if (Use->getParent()->getParent() == &F)
760+
return true;
761+
} else {
762+
const Constant *C = cast<Constant>(U);
763+
if (VisitedConstants.insert(C).second)
764+
Stack.push_back(C);
765+
}
766+
}
767+
768+
return false;
769+
};
753770

754-
// Check how much local memory is being used by global objects
755-
CurrentLocalMemUsage = 0;
756771
for (GlobalVariable &GV : Mod->globals()) {
757772
if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
758773
continue;
759774

760-
for (const User *U : GV.users()) {
761-
const Instruction *Use = dyn_cast<Instruction>(U);
762-
if (!Use) {
763-
// FIXME: This is probably a constant expression use. We should
764-
// recursively search the users of it for the parent function instead of
765-
// bailing.
766-
LLVM_DEBUG(dbgs() << "Giving up on LDS size estimate "
767-
"due to constant expression\n");
768-
return false;
769-
}
775+
if (visitUsers(&GV, &GV)) {
776+
UsedLDS.insert(&GV);
777+
Stack.clear();
778+
continue;
779+
}
770780

771-
if (Use->getParent()->getParent() == &F) {
772-
Align Alignment =
773-
DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType());
774-
775-
// FIXME: Try to account for padding here. The padding is currently
776-
// determined from the inverse order of uses in the function. I'm not
777-
// sure if the use list order is in any way connected to this, so the
778-
// total reported size is likely incorrect.
779-
uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
780-
CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alignment);
781-
CurrentLocalMemUsage += AllocSize;
781+
// For any ConstantExpr uses, we need to recursively search the users until
782+
// we see a function.
783+
while (!Stack.empty()) {
784+
const Constant *C = Stack.pop_back_val();
785+
if (visitUsers(&GV, C)) {
786+
UsedLDS.insert(&GV);
787+
Stack.clear();
782788
break;
783789
}
784790
}
785791
}
786792

793+
const DataLayout &DL = Mod->getDataLayout();
794+
SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
795+
AllocatedSizes.reserve(UsedLDS.size());
796+
797+
for (const GlobalVariable *GV : UsedLDS) {
798+
Align Alignment =
799+
DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
800+
uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType());
801+
AllocatedSizes.emplace_back(AllocSize, Alignment);
802+
}
803+
804+
// Sort to try to estimate the worst case alignment padding
805+
//
806+
// FIXME: We should really do something to fix the addresses to a more optimal
807+
// value instead
808+
llvm::sort(AllocatedSizes.begin(), AllocatedSizes.end(),
809+
[](std::pair<uint64_t, Align> LHS, std::pair<uint64_t, Align> RHS) {
810+
return LHS.second < RHS.second;
811+
});
812+
813+
// Check how much local memory is being used by global objects
814+
CurrentLocalMemUsage = 0;
815+
816+
// FIXME: Try to account for padding here. The real padding and address is
817+
// currently determined from the inverse order of uses in the function when
818+
// legalizing, which could also potentially change. We try to estimate the
819+
// worst case here, but we probably should fix the addresses earlier.
820+
for (auto Alloc : AllocatedSizes) {
821+
CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second);
822+
CurrentLocalMemUsage += Alloc.first;
823+
}
824+
787825
unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
788826
F);
789827

llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3 -disable-promote-alloca-to-vector < %s | FileCheck -check-prefix=GCN %s
22

3-
; This shows that the amount of LDS estimate is sensitive to the order
4-
; of the LDS globals.
3+
; This shows that the LDS size estimate should try not to be
4+
; sensitive to the order of the LDS globals. This should try to
5+
; estimate the worst case padding behavior to avoid overallocating
6+
; LDS.
57

6-
; Both of these functions use the same amount of LDS, but the total
7-
; changes depending on the visit order of first use.
8+
; These functions use the same amount of LDS, but the total, final
9+
; size changes depending on the visit order of first use.
810

911
; The one with the suboptimal order resulting in extra padding exceeds
1012
; the desired limit
@@ -29,7 +31,7 @@
2931

3032

3133
; GCN-LABEL: {{^}}promote_alloca_size_order_0:
32-
; GCN: workgroup_group_segment_byte_size = 2340
34+
; GCN: workgroup_group_segment_byte_size = 1060
3335
define amdgpu_kernel void @promote_alloca_size_order_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
3436
entry:
3537
%stack = alloca [5 x i32], align 4, addrspace(5)
@@ -61,7 +63,7 @@ entry:
6163
}
6264

6365
; GCN-LABEL: {{^}}promote_alloca_size_order_1:
64-
; GCN: workgroup_group_segment_byte_size = 2352
66+
; GCN: workgroup_group_segment_byte_size = 1072
6567
define amdgpu_kernel void @promote_alloca_size_order_1(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
6668
entry:
6769
%stack = alloca [5 x i32], align 4, addrspace(5)

llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll

Lines changed: 135 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,20 @@
44
target datalayout = "A5"
55

66
@all_lds = internal unnamed_addr addrspace(3) global [16384 x i32] undef, align 4
7+
@some_lds = internal unnamed_addr addrspace(3) global [32 x i32] undef, align 4
8+
9+
@initializer_user_some = addrspace(1) global i32 ptrtoint ([32 x i32] addrspace(3)* @some_lds to i32), align 4
10+
@initializer_user_all = addrspace(1) global i32 ptrtoint ([16384 x i32] addrspace(3)* @all_lds to i32), align 4
711

812
; This function cannot promote to using LDS because of the size of the
913
; constant expression use in the function, which was previously not
1014
; detected.
11-
; IR-LABEL: @constant_expression_uses_lds(
15+
; IR-LABEL: @constant_expression_uses_all_lds(
1216
; IR: alloca
1317

14-
; ASM-LABEL: constant_expression_uses_lds:
15-
; ASM: .group_segment_fixed_size: 65536
16-
define amdgpu_kernel void @constant_expression_uses_lds(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
18+
; ASM-LABEL: constant_expression_uses_all_lds:
19+
; ASM: .amdhsa_group_segment_fixed_size 65536
20+
define amdgpu_kernel void @constant_expression_uses_all_lds(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
1721
entry:
1822
%stack = alloca [4 x i32], align 4, addrspace(5)
1923
%gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
@@ -32,4 +36,130 @@ entry:
3236
ret void
3337
}
3438

35-
attributes #0 = { "amdgpu-waves-per-eu"="1,5" }
39+
; Has a constant expression use through a single level of constant
40+
; expression, but not enough LDS to block promotion
41+
42+
; IR-LABEL: @constant_expression_uses_some_lds(
43+
; IR-NOT: alloca
44+
45+
; ASM-LABEL: {{^}}constant_expression_uses_some_lds:
46+
; ASM: .amdhsa_group_segment_fixed_size 4224{{$}}
47+
define amdgpu_kernel void @constant_expression_uses_some_lds(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
48+
entry:
49+
%stack = alloca [4 x i32], align 4, addrspace(5)
50+
%gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
51+
%gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
52+
%gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
53+
%gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
54+
store i32 9, i32 addrspace(5)* %gep0
55+
store i32 10, i32 addrspace(5)* %gep1
56+
store i32 99, i32 addrspace(5)* %gep2
57+
store i32 43, i32 addrspace(5)* %gep3
58+
%arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
59+
%load = load i32, i32 addrspace(5)* %arrayidx, align 4
60+
store i32 %load, i32 addrspace(1)* %out
61+
store volatile i32 ptrtoint ([32 x i32] addrspace(3)* @some_lds to i32), i32 addrspace(1)* undef
62+
ret void
63+
}
64+
65+
declare void @callee(i8*)
66+
67+
; IR-LABEL: @constant_expression_uses_all_lds_multi_level(
68+
; IR: alloca
69+
70+
; ASM-LABEL: {{^}}constant_expression_uses_all_lds_multi_level:
71+
; ASM: .amdhsa_group_segment_fixed_size 65536{{$}}
72+
define amdgpu_kernel void @constant_expression_uses_all_lds_multi_level(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
73+
entry:
74+
%stack = alloca [4 x i32], align 4, addrspace(5)
75+
%gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
76+
%gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
77+
%gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
78+
%gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
79+
store i32 9, i32 addrspace(5)* %gep0
80+
store i32 10, i32 addrspace(5)* %gep1
81+
store i32 99, i32 addrspace(5)* %gep2
82+
store i32 43, i32 addrspace(5)* %gep3
83+
%arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
84+
%load = load i32, i32 addrspace(5)* %arrayidx, align 4
85+
store i32 %load, i32 addrspace(1)* %out
86+
call void @callee(i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* getelementptr inbounds ([16384 x i32], [16384 x i32] addrspace(3)* @all_lds, i32 0, i32 8) to i8 addrspace(3)*) to i8*))
87+
ret void
88+
}
89+
90+
; IR-LABEL: @constant_expression_uses_some_lds_multi_level(
91+
; IR-NOT: alloca
92+
; IR: llvm.amdgcn.workitem.id
93+
94+
; ASM-LABEL: {{^}}constant_expression_uses_some_lds_multi_level:
95+
; ASM: .amdhsa_group_segment_fixed_size 4224{{$}}
96+
define amdgpu_kernel void @constant_expression_uses_some_lds_multi_level(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
97+
entry:
98+
%stack = alloca [4 x i32], align 4, addrspace(5)
99+
%gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
100+
%gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
101+
%gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
102+
%gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
103+
store i32 9, i32 addrspace(5)* %gep0
104+
store i32 10, i32 addrspace(5)* %gep1
105+
store i32 99, i32 addrspace(5)* %gep2
106+
store i32 43, i32 addrspace(5)* %gep3
107+
%arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
108+
%load = load i32, i32 addrspace(5)* %arrayidx, align 4
109+
store i32 %load, i32 addrspace(1)* %out
110+
call void @callee(i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(3)* @some_lds, i32 0, i32 8) to i8 addrspace(3)*) to i8*))
111+
ret void
112+
}
113+
114+
; IR-LABEL: @constant_expression_uses_some_lds_global_initializer(
115+
; IR-NOT: alloca
116+
; IR: llvm.amdgcn.workitem.id
117+
118+
; ASM-LABEL: {{^}}constant_expression_uses_some_lds_global_initializer:
119+
; ASM: .amdhsa_group_segment_fixed_size 4096{{$}}
120+
define amdgpu_kernel void @constant_expression_uses_some_lds_global_initializer(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
121+
entry:
122+
%stack = alloca [4 x i32], align 4, addrspace(5)
123+
%gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
124+
%gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
125+
%gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
126+
%gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
127+
store i32 9, i32 addrspace(5)* %gep0
128+
store i32 10, i32 addrspace(5)* %gep1
129+
store i32 99, i32 addrspace(5)* %gep2
130+
store i32 43, i32 addrspace(5)* %gep3
131+
%arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
132+
%load = load i32, i32 addrspace(5)* %arrayidx, align 4
133+
store i32 %load, i32 addrspace(1)* %out
134+
135+
store volatile i32 ptrtoint (i32 addrspace(1)* @initializer_user_some to i32), i32 addrspace(1)* undef
136+
ret void
137+
}
138+
139+
; We can't actually handle LDS initializers in global initializers,
140+
; but this should count as usage.
141+
142+
; IR-LABEL: @constant_expression_uses_all_lds_global_initializer(
143+
; IR: alloca
144+
145+
; ASM-LABEL: {{^}}constant_expression_uses_all_lds_global_initializer:
146+
; ASM: .group_segment_fixed_size: 65536
147+
define amdgpu_kernel void @constant_expression_uses_all_lds_global_initializer(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
148+
entry:
149+
%stack = alloca [4 x i32], align 4, addrspace(5)
150+
%gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
151+
%gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
152+
%gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
153+
%gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
154+
store i32 9, i32 addrspace(5)* %gep0
155+
store i32 10, i32 addrspace(5)* %gep1
156+
store i32 99, i32 addrspace(5)* %gep2
157+
store i32 43, i32 addrspace(5)* %gep3
158+
%arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
159+
%load = load i32, i32 addrspace(5)* %arrayidx, align 4
160+
store i32 %load, i32 addrspace(1)* %out
161+
store volatile i32 ptrtoint (i32 addrspace(1)* @initializer_user_all to i32), i32 addrspace(1)* undef
162+
ret void
163+
}
164+
165+
attributes #0 = { "amdgpu-waves-per-eu"="1,5" "amdgpu-flat-work-group-size"="256,256" }

0 commit comments

Comments
 (0)