Skip to content

Commit 2a7c14d

Browse files
JanekvOtomtor
authored and committed
[AMDGPU] Flatten recursive register resource info propagation (llvm#142766)
In llvm#112251 I had mentioned I'd follow up with flattening of recursion for register resource info propagation Behaviour prior to this patch when a recursive call is used is to take the module scope worst case function register use (even prior to AMDGPUMCResourceInfo). With this patch it will, when a cycle is detected, attempt to do a simple cycle avoidant dfs to find the worst case constant within the cycle and the cycle's propagates. In other words, it will attempt to look for the cycle scope worst case rather than module scope worst case.
1 parent f6ebd7f commit 2a7c14d

File tree

4 files changed

+188
-32
lines changed

4 files changed

+188
-32
lines changed

llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp

Lines changed: 87 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,86 @@ MCSymbol *MCResourceInfo::getMaxSGPRSymbol(MCContext &OutContext) {
9797
return OutContext.getOrCreateSymbol("amdgpu.max_num_sgpr");
9898
}
9999

100+
// Tries to flatten recursive call register resource gathering. Simple cycle
101+
// avoiding dfs to find the constants in the propagated symbols.
102+
// Assumes:
103+
// - RecSym has been confirmed to recurse (this means the callee symbols should
104+
// all be populated, started at RecSym).
105+
// - Shape of the resource symbol's MCExpr (`max` args are order agnostic):
106+
// RecSym.MCExpr := max(<constant>+, <callee_symbol>*)
107+
const MCExpr *MCResourceInfo::flattenedCycleMax(MCSymbol *RecSym,
108+
ResourceInfoKind RIK,
109+
MCContext &OutContext) {
110+
SmallPtrSet<const MCExpr *, 8> Seen;
111+
SmallVector<const MCExpr *, 8> WorkList;
112+
int64_t Maximum = 0;
113+
114+
const MCExpr *RecExpr = RecSym->getVariableValue();
115+
WorkList.push_back(RecExpr);
116+
117+
while (!WorkList.empty()) {
118+
const MCExpr *CurExpr = WorkList.pop_back_val();
119+
switch (CurExpr->getKind()) {
120+
default: {
121+
// Assuming the recursion is of shape `max(<constant>, <callee_symbol>)`
122+
// where <callee_symbol> will eventually recurse. If this condition holds,
123+
// the recursion occurs within some other (possibly unresolvable) MCExpr,
124+
// thus using the worst case value then.
125+
if (!AMDGPUMCExpr::isSymbolUsedInExpression(RecSym, CurExpr)) {
126+
LLVM_DEBUG(dbgs() << "MCResUse: " << RecSym->getName()
127+
<< ": Recursion in unexpected sub-expression, using "
128+
"module maximum\n");
129+
switch (RIK) {
130+
default:
131+
break;
132+
case RIK_NumVGPR:
133+
return MCSymbolRefExpr::create(getMaxVGPRSymbol(OutContext),
134+
OutContext);
135+
break;
136+
case RIK_NumSGPR:
137+
return MCSymbolRefExpr::create(getMaxSGPRSymbol(OutContext),
138+
OutContext);
139+
break;
140+
case RIK_NumAGPR:
141+
return MCSymbolRefExpr::create(getMaxAGPRSymbol(OutContext),
142+
OutContext);
143+
break;
144+
}
145+
}
146+
break;
147+
}
148+
case MCExpr::ExprKind::Constant: {
149+
int64_t Val = cast<MCConstantExpr>(CurExpr)->getValue();
150+
Maximum = std::max(Maximum, Val);
151+
break;
152+
}
153+
case MCExpr::ExprKind::SymbolRef: {
154+
const MCSymbolRefExpr *SymExpr = cast<MCSymbolRefExpr>(CurExpr);
155+
const MCSymbol &SymRef = SymExpr->getSymbol();
156+
if (SymRef.isVariable()) {
157+
const MCExpr *SymVal = SymRef.getVariableValue();
158+
if (Seen.insert(SymVal).second)
159+
WorkList.push_back(SymVal);
160+
}
161+
break;
162+
}
163+
case MCExpr::ExprKind::Target: {
164+
const AMDGPUMCExpr *TargetExpr = cast<AMDGPUMCExpr>(CurExpr);
165+
if (TargetExpr->getKind() == AMDGPUMCExpr::VariantKind::AGVK_Max) {
166+
for (auto &Arg : TargetExpr->getArgs())
167+
WorkList.push_back(Arg);
168+
}
169+
break;
170+
}
171+
}
172+
}
173+
174+
LLVM_DEBUG(dbgs() << "MCResUse: " << RecSym->getName()
175+
<< ": Using flattened max: << " << Maximum << '\n');
176+
177+
return MCConstantExpr::create(Maximum, OutContext);
178+
}
179+
100180
void MCResourceInfo::assignResourceInfoExpr(
101181
int64_t LocalValue, ResourceInfoKind RIK, AMDGPUMCExpr::VariantKind Kind,
102182
const MachineFunction &MF, const SmallVectorImpl<const Function *> &Callees,
@@ -133,25 +213,19 @@ void MCResourceInfo::assignResourceInfoExpr(
133213
<< CalleeValSym->getName() << " as callee\n");
134214
ArgExprs.push_back(MCSymbolRefExpr::create(CalleeValSym, OutContext));
135215
} else {
136-
LLVM_DEBUG(
137-
dbgs() << "MCResUse: " << Sym->getName()
138-
<< ": Recursion found, falling back to module maximum\n");
139-
// In case of recursion: make sure to use conservative register counts
140-
// (i.e., specifically for VGPR/SGPR/AGPR).
216+
LLVM_DEBUG(dbgs() << "MCResUse: " << Sym->getName()
217+
<< ": Recursion found, attempt flattening of cycle "
218+
"for resource usage\n");
219+
// In case of recursion for vgpr/sgpr/agpr resource usage: try to
220+
// flatten and use the max of the call cycle. May still end up emitting
221+
// module max if not fully resolvable.
141222
switch (RIK) {
142223
default:
143224
break;
144225
case RIK_NumVGPR:
145-
ArgExprs.push_back(MCSymbolRefExpr::create(
146-
getMaxVGPRSymbol(OutContext), OutContext));
147-
break;
148226
case RIK_NumSGPR:
149-
ArgExprs.push_back(MCSymbolRefExpr::create(
150-
getMaxSGPRSymbol(OutContext), OutContext));
151-
break;
152227
case RIK_NumAGPR:
153-
ArgExprs.push_back(MCSymbolRefExpr::create(
154-
getMaxAGPRSymbol(OutContext), OutContext));
228+
ArgExprs.push_back(flattenedCycleMax(CalleeValSym, RIK, OutContext));
155229
break;
156230
}
157231
}

llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,12 @@ class MCResourceInfo {
5858
// Assigns expression for Max S/V/A-GPRs to the referenced symbols.
5959
void assignMaxRegs(MCContext &OutContext);
6060

61+
// Take flattened max of cyclic function calls' knowns. For example, for
62+
// a cycle A->B->C->D->A, take max(A, B, C, D) for A and have B, C, D have the
63+
// propagated value from A.
64+
const MCExpr *flattenedCycleMax(MCSymbol *RecSym, ResourceInfoKind RIK,
65+
MCContext &OutContext);
66+
6167
public:
6268
MCResourceInfo() = default;
6369
void addMaxVGPRCandidate(int32_t candidate) {

llvm/test/CodeGen/AMDGPU/function-resource-usage.ll

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -495,17 +495,17 @@ define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
495495
; GCN: NumVgprs: max(43, multi_stage_recurse1.num_vgpr)
496496
; GCN: ScratchSize: 16+max(multi_stage_recurse1.private_seg_size)
497497
; GCN-LABEL: {{^}}multi_stage_recurse1:
498-
; GCN: .set multi_stage_recurse1.num_vgpr, max(48, amdgpu.max_num_vgpr)
499-
; GCN: .set multi_stage_recurse1.num_agpr, max(0, amdgpu.max_num_agpr)
500-
; GCN: .set multi_stage_recurse1.numbered_sgpr, max(34, amdgpu.max_num_sgpr)
498+
; GCN: .set multi_stage_recurse1.num_vgpr, max(48, 43)
499+
; GCN: .set multi_stage_recurse1.num_agpr, max(0, 0)
500+
; GCN: .set multi_stage_recurse1.numbered_sgpr, max(34, 34)
501501
; GCN: .set multi_stage_recurse1.private_seg_size, 16
502502
; GCN: .set multi_stage_recurse1.uses_vcc, 1
503503
; GCN: .set multi_stage_recurse1.uses_flat_scratch, 0
504504
; GCN: .set multi_stage_recurse1.has_dyn_sized_stack, 0
505505
; GCN: .set multi_stage_recurse1.has_recursion, 1
506506
; GCN: .set multi_stage_recurse1.has_indirect_call, 0
507-
; GCN: TotalNumSgprs: multi_stage_recurse1.numbered_sgpr+4
508-
; GCN: NumVgprs: max(48, amdgpu.max_num_vgpr)
507+
; GCN: TotalNumSgprs: 38
508+
; GCN: NumVgprs: 48
509509
; GCN: ScratchSize: 16
510510
define void @multi_stage_recurse1(i32 %val) #2 {
511511
call void @multi_stage_recurse2(i32 %val)
@@ -528,8 +528,8 @@ define void @multi_stage_recurse2(i32 %val) #2 {
528528
; GCN: .set usage_multi_stage_recurse.has_dyn_sized_stack, or(0, multi_stage_recurse1.has_dyn_sized_stack)
529529
; GCN: .set usage_multi_stage_recurse.has_recursion, or(1, multi_stage_recurse1.has_recursion)
530530
; GCN: .set usage_multi_stage_recurse.has_indirect_call, or(0, multi_stage_recurse1.has_indirect_call)
531-
; GCN: TotalNumSgprs: usage_multi_stage_recurse.numbered_sgpr+6
532-
; GCN: NumVgprs: usage_multi_stage_recurse.num_vgpr
531+
; GCN: TotalNumSgprs: 40
532+
; GCN: NumVgprs: 48
533533
; GCN: ScratchSize: 16
534534
define amdgpu_kernel void @usage_multi_stage_recurse(i32 %n) #0 {
535535
call void @multi_stage_recurse1(i32 %n)
@@ -550,17 +550,17 @@ define amdgpu_kernel void @usage_multi_stage_recurse(i32 %n) #0 {
550550
; GCN: NumVgprs: max(41, multi_stage_recurse_noattr1.num_vgpr)
551551
; GCN: ScratchSize: 16+max(multi_stage_recurse_noattr1.private_seg_size)
552552
; GCN-LABEL: {{^}}multi_stage_recurse_noattr1:
553-
; GCN: .set multi_stage_recurse_noattr1.num_vgpr, max(41, amdgpu.max_num_vgpr)
554-
; GCN: .set multi_stage_recurse_noattr1.num_agpr, max(0, amdgpu.max_num_agpr)
555-
; GCN: .set multi_stage_recurse_noattr1.numbered_sgpr, max(57, amdgpu.max_num_sgpr)
553+
; GCN: .set multi_stage_recurse_noattr1.num_vgpr, max(41, 41)
554+
; GCN: .set multi_stage_recurse_noattr1.num_agpr, max(0, 0)
555+
; GCN: .set multi_stage_recurse_noattr1.numbered_sgpr, max(57, 54)
556556
; GCN: .set multi_stage_recurse_noattr1.private_seg_size, 16
557557
; GCN: .set multi_stage_recurse_noattr1.uses_vcc, 1
558558
; GCN: .set multi_stage_recurse_noattr1.uses_flat_scratch, 0
559559
; GCN: .set multi_stage_recurse_noattr1.has_dyn_sized_stack, 0
560560
; GCN: .set multi_stage_recurse_noattr1.has_recursion, 0
561561
; GCN: .set multi_stage_recurse_noattr1.has_indirect_call, 0
562-
; GCN: TotalNumSgprs: multi_stage_recurse_noattr1.numbered_sgpr+4
563-
; GCN: NumVgprs: max(41, amdgpu.max_num_vgpr)
562+
; GCN: TotalNumSgprs: 61
563+
; GCN: NumVgprs: 41
564564
; GCN: ScratchSize: 16
565565
define void @multi_stage_recurse_noattr1(i32 %val) #0 {
566566
call void @multi_stage_recurse_noattr2(i32 %val)
@@ -583,8 +583,8 @@ define void @multi_stage_recurse_noattr2(i32 %val) #0 {
583583
; GCN: .set usage_multi_stage_recurse_noattrs.has_dyn_sized_stack, or(0, multi_stage_recurse_noattr1.has_dyn_sized_stack)
584584
; GCN: .set usage_multi_stage_recurse_noattrs.has_recursion, or(0, multi_stage_recurse_noattr1.has_recursion)
585585
; GCN: .set usage_multi_stage_recurse_noattrs.has_indirect_call, or(0, multi_stage_recurse_noattr1.has_indirect_call)
586-
; GCN: TotalNumSgprs: usage_multi_stage_recurse_noattrs.numbered_sgpr+6
587-
; GCN: NumVgprs: usage_multi_stage_recurse_noattrs.num_vgpr
586+
; GCN: TotalNumSgprs: 63
587+
; GCN: NumVgprs: 41
588588
; GCN: ScratchSize: 16
589589
define amdgpu_kernel void @usage_multi_stage_recurse_noattrs(i32 %n) #0 {
590590
call void @multi_stage_recurse_noattr1(i32 %n)
@@ -601,8 +601,8 @@ define amdgpu_kernel void @usage_multi_stage_recurse_noattrs(i32 %n) #0 {
601601
; GCN: .set multi_call_with_multi_stage_recurse.has_dyn_sized_stack, or(0, use_stack0.has_dyn_sized_stack, use_stack1.has_dyn_sized_stack, multi_stage_recurse1.has_dyn_sized_stack)
602602
; GCN: .set multi_call_with_multi_stage_recurse.has_recursion, or(1, use_stack0.has_recursion, use_stack1.has_recursion, multi_stage_recurse1.has_recursion)
603603
; GCN: .set multi_call_with_multi_stage_recurse.has_indirect_call, or(0, use_stack0.has_indirect_call, use_stack1.has_indirect_call, multi_stage_recurse1.has_indirect_call)
604-
; GCN: TotalNumSgprs: multi_call_with_multi_stage_recurse.numbered_sgpr+6
605-
; GCN: NumVgprs: multi_call_with_multi_stage_recurse.num_vgpr
604+
; GCN: TotalNumSgprs: 59
605+
; GCN: NumVgprs: 48
606606
; GCN: ScratchSize: 2052
607607
define amdgpu_kernel void @multi_call_with_multi_stage_recurse(i32 %n) #0 {
608608
call void @use_stack0()

llvm/test/CodeGen/AMDGPU/recursive-resource-usage-mcexpr.ll

Lines changed: 79 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck %s
22

3+
; Recursion: foo -> bar -> baz -> qux -> foo
4+
35
; CHECK-LABEL: {{^}}qux
46
; CHECK: .set qux.num_vgpr, max(71, foo.num_vgpr)
57
; CHECK: .set qux.num_agpr, max(0, foo.num_agpr)
@@ -34,9 +36,9 @@
3436
; CHECK: .set bar.has_indirect_call, or(0, baz.has_indirect_call)
3537

3638
; CHECK-LABEL: {{^}}foo
37-
; CHECK: .set foo.num_vgpr, max(46, amdgpu.max_num_vgpr)
38-
; CHECK: .set foo.num_agpr, max(0, amdgpu.max_num_agpr)
39-
; CHECK: .set foo.numbered_sgpr, max(71, amdgpu.max_num_sgpr)
39+
; CHECK: .set foo.num_vgpr, max(46, 71)
40+
; CHECK: .set foo.num_agpr, max(0, 0)
41+
; CHECK: .set foo.numbered_sgpr, max(71, 61)
4042
; CHECK: .set foo.private_seg_size, 16
4143
; CHECK: .set foo.uses_vcc, 1
4244
; CHECK: .set foo.uses_flat_scratch, 0
@@ -91,3 +93,77 @@ define amdgpu_kernel void @usefoo() {
9193
ret void
9294
}
9395

96+
; Recursion: A -> B -> C -> A && C -> D -> C
97+
98+
; CHECK-LABEL: {{^}}D
99+
; CHECK: .set D.num_vgpr, max(71, C.num_vgpr)
100+
; CHECK: .set D.num_agpr, max(0, C.num_agpr)
101+
; CHECK: .set D.numbered_sgpr, max(71, C.numbered_sgpr)
102+
; CHECK: .set D.private_seg_size, 16+max(C.private_seg_size)
103+
; CHECK: .set D.uses_vcc, or(1, C.uses_vcc)
104+
; CHECK: .set D.uses_flat_scratch, or(0, C.uses_flat_scratch)
105+
; CHECK: .set D.has_dyn_sized_stack, or(0, C.has_dyn_sized_stack)
106+
; CHECK: .set D.has_recursion, or(1, C.has_recursion)
107+
; CHECK: .set D.has_indirect_call, or(0, C.has_indirect_call)
108+
109+
; CHECK-LABEL: {{^}}C
110+
; CHECK: .set C.num_vgpr, max(42, A.num_vgpr, 71)
111+
; CHECK: .set C.num_agpr, max(0, A.num_agpr, 0)
112+
; CHECK: .set C.numbered_sgpr, max(71, A.numbered_sgpr, 71)
113+
; CHECK: .set C.private_seg_size, 16+max(A.private_seg_size)
114+
; CHECK: .set C.uses_vcc, or(1, A.uses_vcc)
115+
; CHECK: .set C.uses_flat_scratch, or(0, A.uses_flat_scratch)
116+
; CHECK: .set C.has_dyn_sized_stack, or(0, A.has_dyn_sized_stack)
117+
; CHECK: .set C.has_recursion, or(1, A.has_recursion)
118+
; CHECK: .set C.has_indirect_call, or(0, A.has_indirect_call)
119+
120+
; CHECK-LABEL: {{^}}B
121+
; CHECK: .set B.num_vgpr, max(42, C.num_vgpr)
122+
; CHECK: .set B.num_agpr, max(0, C.num_agpr)
123+
; CHECK: .set B.numbered_sgpr, max(71, C.numbered_sgpr)
124+
; CHECK: .set B.private_seg_size, 16+max(C.private_seg_size)
125+
; CHECK: .set B.uses_vcc, or(1, C.uses_vcc)
126+
; CHECK: .set B.uses_flat_scratch, or(0, C.uses_flat_scratch)
127+
; CHECK: .set B.has_dyn_sized_stack, or(0, C.has_dyn_sized_stack)
128+
; CHECK: .set B.has_recursion, or(1, C.has_recursion)
129+
; CHECK: .set B.has_indirect_call, or(0, C.has_indirect_call)
130+
131+
; CHECK-LABEL: {{^}}A
132+
; CHECK: .set A.num_vgpr, max(42, 71)
133+
; CHECK: .set A.num_agpr, max(0, 0)
134+
; CHECK: .set A.numbered_sgpr, max(71, 71)
135+
; CHECK: .set A.private_seg_size, 16
136+
; CHECK: .set A.uses_vcc, 1
137+
; CHECK: .set A.uses_flat_scratch, 0
138+
; CHECK: .set A.has_dyn_sized_stack, 0
139+
; CHECK: .set A.has_recursion, 1
140+
; CHECK: .set A.has_indirect_call, 0
141+
142+
define void @A() {
143+
call void @B()
144+
call void asm sideeffect "", "~{v10}"()
145+
call void asm sideeffect "", "~{s50}"()
146+
ret void
147+
}
148+
149+
define void @B() {
150+
call void @C()
151+
call void asm sideeffect "", "~{v20}"()
152+
call void asm sideeffect "", "~{s30}"()
153+
ret void
154+
}
155+
156+
define void @C() {
157+
call void @A()
158+
call void @D()
159+
call void asm sideeffect "", "~{v30}"()
160+
call void asm sideeffect "", "~{s40}"()
161+
ret void
162+
}
163+
164+
define void @D() {
165+
call void @C()
166+
call void asm sideeffect "", "~{v70}"()
167+
call void asm sideeffect "", "~{s70}"()
168+
ret void
169+
}

0 commit comments

Comments
 (0)