Skip to content

Commit 814a0ab

Browse files
jacobdweightman authored and chichunchen committed
AMDGPU: allow reordering of functions in AMDGPUResourceUsageAnalysis
The AMDGPUResourceUsageAnalysis was previously a CGSCC pass, and assumed that a function's callees were always analyzed prior to their callers. When it was refactored into a module pass, this assumption no longer always held. As a result, calls to functions that have not yet been analyzed are erroneously identified as indirect, and private segment space is reserved for them, which significantly increases kernel launch latency. This patch changes the order in which the module's functions are analyzed from the order in which they occur in the module to a post-order traversal of the call graph. Perhaps Clang always generates the module's functions in such an order, but this is not the case for the Cray Fortran compiler. Reviewed By: #amdgpu, arsenm Differential Revision: https://reviews.llvm.org/D126025
1 parent 8da5d5d commit 814a0ab

File tree

2 files changed

+146
-4
lines changed

2 files changed

+146
-4
lines changed

llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#include "AMDGPU.h"
2828
#include "GCNSubtarget.h"
2929
#include "SIMachineFunctionInfo.h"
30+
#include "llvm/ADT/PostOrderIterator.h"
3031
#include "llvm/Analysis/CallGraph.h"
3132
#include "llvm/CodeGen/MachineFrameInfo.h"
3233
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -105,15 +106,19 @@ bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
105106
const TargetMachine &TM = TPC->getTM<TargetMachine>();
106107
bool HasIndirectCall = false;
107108

108-
for (Function &F : M) {
109-
if (F.isDeclaration())
109+
CallGraph CG = CallGraph(M);
110+
auto End = po_end(&CG);
111+
112+
for (auto IT = po_begin(&CG); IT != End; ++IT) {
113+
Function *F = IT->getFunction();
114+
if (!F || F->isDeclaration())
110115
continue;
111116

112-
MachineFunction *MF = MMI.getMachineFunction(F);
117+
MachineFunction *MF = MMI.getMachineFunction(*F);
113118
assert(MF && "function must have been generated already");
114119

115120
auto CI = CallGraphResourceInfo.insert(
116-
std::make_pair(&F, SIFunctionResourceInfo()));
121+
std::make_pair(F, SIFunctionResourceInfo()));
117122
SIFunctionResourceInfo &Info = CI.first->second;
118123
assert(CI.second && "should only be called once per function");
119124
Info = analyzeResourceUsage(*MF, TM);
Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
; Note: uses a randomly selected assumed external call stack size so that the
2+
; test assertions are unlikely to succeed by accident.
3+
4+
; RUN: llc -amdgpu-assume-external-call-stack-size=5310 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=4 -enable-misched=0 -filetype=asm -o - < %s | FileCheck --check-prefixes CHECK,GFX7 %s
5+
; RUN: llc -amdgpu-assume-external-call-stack-size=5310 -mattr=-xnack -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=4 -mcpu=gfx803 -enable-misched=0 -filetype=asm -o - < %s | FileCheck --check-prefixes CHECK,GFX8 %s
6+
; RUN: llc -amdgpu-assume-external-call-stack-size=5310 -mattr=-xnack -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=4 -mcpu=gfx900 -enable-misched=0 -filetype=asm -o - < %s | FileCheck --check-prefixes CHECK,GFX9 %s
7+
; RUN: llc -amdgpu-assume-external-call-stack-size=5310 -mattr=-xnack -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=4 -mcpu=gfx1010 -enable-misched=0 -filetype=asm -o - < %s | FileCheck --check-prefixes CHECK,GFX10 %s
8+
9+
; CHECK-LABEL: amdhsa.kernels
10+
11+
; test a kernel without an external call that occurs before its callee in the module
12+
; CHECK-LABEL: test1
13+
; CHECK: .private_segment_fixed_size: 20
14+
15+
; GFX7: .sgpr_count: 37
16+
; GFX7: .sgpr_spill_count: 0
17+
; GFX7: .vgpr_count: 4
18+
; GFX7: .vgpr_spill_count: 0
19+
20+
; GFX8: .sgpr_count: 39
21+
; GFX8: .sgpr_spill_count: 0
22+
; GFX8: .vgpr_count: 4
23+
; GFX8: .vgpr_spill_count: 0
24+
25+
; GFX9: .sgpr_count: 39
26+
; GFX9: .sgpr_spill_count: 0
27+
; GFX9: .vgpr_count: 4
28+
; GFX9: .vgpr_spill_count: 0
29+
30+
; GFX10: .sgpr_count: 33
31+
; GFX10: .sgpr_spill_count: 0
32+
; GFX10: .vgpr_count: 4
33+
; GFX10: .vgpr_spill_count: 0
34+
define amdgpu_kernel void @test1(float* %x) {
35+
%1 = load volatile float, float* %x
36+
%2 = call float @f(float %1)
37+
store volatile float %2, float* %x
38+
ret void
39+
}
40+
41+
define internal float @f(float %arg0) #0 {
42+
%stack = alloca float, i32 4, align 4, addrspace(5)
43+
store volatile float 3.0, float addrspace(5)* %stack
44+
%val = load volatile float, float addrspace(5)* %stack
45+
%add = fadd float %arg0, %val
46+
ret float %add
47+
}
48+
49+
; test a kernel without an external call that occurs after its callee in the module
50+
; CHECK-LABEL: test2
51+
; CHECK: .private_segment_fixed_size: 20
52+
53+
; GFX7: .sgpr_count: 37
54+
; GFX7: .sgpr_spill_count: 0
55+
; GFX7: .vgpr_count: 4
56+
; GFX7: .vgpr_spill_count: 0
57+
58+
; GFX8: .sgpr_count: 39
59+
; GFX8: .sgpr_spill_count: 0
60+
; GFX8: .vgpr_count: 4
61+
; GFX8: .vgpr_spill_count: 0
62+
63+
; GFX9: .sgpr_count: 39
64+
; GFX9: .sgpr_spill_count: 0
65+
; GFX9: .vgpr_count: 4
66+
; GFX9: .vgpr_spill_count: 0
67+
68+
; GFX10: .sgpr_count: 33
69+
; GFX10: .sgpr_spill_count: 0
70+
; GFX10: .vgpr_count: 4
71+
; GFX10: .vgpr_spill_count: 0
72+
define amdgpu_kernel void @test2(float* %x) {
73+
%1 = load volatile float, float* %x
74+
%2 = call float @f(float %1)
75+
store volatile float %2, float* %x
76+
ret void
77+
}
78+
79+
; test a kernel with an external call that occurs before its callee in the module
80+
; CHECK-LABEL: test3
81+
; CHECK: .private_segment_fixed_size: 5310
82+
83+
; GFX7: .sgpr_count: 37
84+
; GFX7: .sgpr_spill_count: 0
85+
; GFX7: .vgpr_count: 32
86+
; GFX7: .vgpr_spill_count: 0
87+
88+
; GFX8: .sgpr_count: 39
89+
; GFX8: .sgpr_spill_count: 0
90+
; GFX8: .vgpr_count: 32
91+
; GFX8: .vgpr_spill_count: 0
92+
93+
; GFX9: .sgpr_count: 39
94+
; GFX9: .sgpr_spill_count: 0
95+
; GFX9: .vgpr_count: 32
96+
; GFX9: .vgpr_spill_count: 0
97+
98+
; GFX10: .sgpr_count: 35
99+
; GFX10: .sgpr_spill_count: 0
100+
; GFX10: .vgpr_count: 32
101+
; GFX10: .vgpr_spill_count: 0
102+
define amdgpu_kernel void @test3() {
103+
call void @g()
104+
ret void
105+
}
106+
107+
declare void @g() #0
108+
109+
; test a kernel with an external call that occurs after its callee in the module
110+
; CHECK-LABEL: test4
111+
; CHECK: .private_segment_fixed_size: 5310
112+
113+
; GFX7: .sgpr_count: 37
114+
; GFX7: .sgpr_spill_count: 0
115+
; GFX7: .vgpr_count: 32
116+
; GFX7: .vgpr_spill_count: 0
117+
118+
; GFX8: .sgpr_count: 39
119+
; GFX8: .sgpr_spill_count: 0
120+
; GFX8: .vgpr_count: 32
121+
; GFX8: .vgpr_spill_count: 0
122+
123+
; GFX9: .sgpr_count: 39
124+
; GFX9: .sgpr_spill_count: 0
125+
; GFX9: .vgpr_count: 32
126+
; GFX9: .vgpr_spill_count: 0
127+
128+
; GFX10: .sgpr_count: 35
129+
; GFX10: .sgpr_spill_count: 0
130+
; GFX10: .vgpr_count: 32
131+
; GFX10: .vgpr_spill_count: 0
132+
define amdgpu_kernel void @test4() {
133+
call void @g()
134+
ret void
135+
}
136+
137+
attributes #0 = { norecurse }

0 commit comments

Comments
 (0)