Skip to content

Commit af4f176

Browse files
authored
AMDGPU: Allocate special SGPRs before user SGPR arguments (#78234)
1 parent a348397 commit af4f176

9 files changed: +1953 -175 lines changed

llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp

Lines changed: 5 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -715,6 +715,10 @@ bool AMDGPUCallLowering::lowerFormalArguments(
715715
if (!IsEntryFunc && !IsGraphics) {
716716
// For the fixed ABI, pass workitem IDs in the last argument register.
717717
TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
718+
719+
if (!Subtarget.enableFlatScratch())
720+
CCInfo.AllocateReg(Info->getScratchRSrcReg());
721+
TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
718722
}
719723

720724
IncomingValueAssigner Assigner(AssignFn);
@@ -728,13 +732,8 @@ bool AMDGPUCallLowering::lowerFormalArguments(
728732
uint64_t StackSize = Assigner.StackSize;
729733

730734
// Start adding system SGPRs.
731-
if (IsEntryFunc) {
735+
if (IsEntryFunc)
732736
TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsGraphics);
733-
} else {
734-
if (!Subtarget.enableFlatScratch())
735-
CCInfo.AllocateReg(Info->getScratchRSrcReg());
736-
TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
737-
}
738737

739738
// When we tail call, we need to check if the callee's arguments will fit on
740739
// the caller's stack. So, whenever we lower formal arguments, we should keep

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 7 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -2783,15 +2783,16 @@ SDValue SITargetLowering::LowerFormalArguments(
27832783
} else if (!IsGraphics) {
27842784
// For the fixed ABI, pass workitem IDs in the last argument register.
27852785
allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2786+
2787+
// FIXME: Sink this into allocateSpecialInputSGPRs
2788+
if (!Subtarget->enableFlatScratch())
2789+
CCInfo.AllocateReg(Info->getScratchRSrcReg());
2790+
2791+
allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
27862792
}
27872793

27882794
if (!IsKernel) {
27892795
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2790-
if (!IsGraphics && !Subtarget->enableFlatScratch()) {
2791-
CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{AMDGPU::SGPR0, AMDGPU::SGPR1,
2792-
AMDGPU::SGPR2, AMDGPU::SGPR3},
2793-
4);
2794-
}
27952796
CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
27962797
}
27972798

@@ -2991,13 +2992,8 @@ SDValue SITargetLowering::LowerFormalArguments(
29912992
}
29922993

29932994
// Start adding system SGPRs.
2994-
if (IsEntryFunc) {
2995+
if (IsEntryFunc)
29952996
allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
2996-
} else {
2997-
CCInfo.AllocateReg(Info->getScratchRSrcReg());
2998-
if (!IsGraphics)
2999-
allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
3000-
}
30012997

30022998
auto &ArgUsageInfo =
30032999
DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();

llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll

Lines changed: 43 additions & 43 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll

Lines changed: 86 additions & 86 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -147,13 +147,13 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
147147
; CHECK-NEXT: v_writelane_b32 v40, s45, 13
148148
; CHECK-NEXT: v_mov_b32_e32 v41, v31
149149
; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7]
150+
; CHECK-NEXT: v_mov_b32_e32 v42, v2
150151
; CHECK-NEXT: s_mov_b32 s42, s15
151152
; CHECK-NEXT: s_mov_b32 s43, s14
152153
; CHECK-NEXT: s_mov_b32 s44, s13
153154
; CHECK-NEXT: s_mov_b32 s45, s12
154155
; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11]
155156
; CHECK-NEXT: s_mov_b64 s[38:39], s[8:9]
156-
; CHECK-NEXT: v_mov_b32_e32 v42, v2
157157
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
158158
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
159159
; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v42
@@ -286,14 +286,14 @@ define double @test_powr_fast_f64(double %x, double %y) {
286286
; CHECK-NEXT: v_writelane_b32 v40, s45, 13
287287
; CHECK-NEXT: v_mov_b32_e32 v43, v31
288288
; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7]
289+
; CHECK-NEXT: v_mov_b32_e32 v42, v3
290+
; CHECK-NEXT: v_mov_b32_e32 v41, v2
289291
; CHECK-NEXT: s_mov_b32 s42, s15
290292
; CHECK-NEXT: s_mov_b32 s43, s14
291293
; CHECK-NEXT: s_mov_b32 s44, s13
292294
; CHECK-NEXT: s_mov_b32 s45, s12
293295
; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11]
294296
; CHECK-NEXT: s_mov_b64 s[38:39], s[8:9]
295-
; CHECK-NEXT: v_mov_b32_e32 v42, v3
296-
; CHECK-NEXT: v_mov_b32_e32 v41, v2
297297
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
298298
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
299299
; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[41:42]
@@ -431,13 +431,13 @@ define double @test_pown_fast_f64(double %x, i32 %y) {
431431
; CHECK-NEXT: v_writelane_b32 v40, s45, 13
432432
; CHECK-NEXT: v_mov_b32_e32 v41, v31
433433
; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7]
434+
; CHECK-NEXT: v_mov_b32_e32 v42, v2
434435
; CHECK-NEXT: s_mov_b32 s42, s15
435436
; CHECK-NEXT: s_mov_b32 s43, s14
436437
; CHECK-NEXT: s_mov_b32 s44, s13
437438
; CHECK-NEXT: s_mov_b32 s45, s12
438439
; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11]
439440
; CHECK-NEXT: s_mov_b64 s[38:39], s[8:9]
440-
; CHECK-NEXT: v_mov_b32_e32 v42, v2
441441
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
442442
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
443443
; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v42

0 commit comments

Comments
 (0)