Skip to content

Commit 012e23c

Browse files
easyonaadit, pravinjagtap, arsenm
committed
[AMDGPU] Restore SP from saved-FP or saved-BP (llvm#124007)
Currently, the AMDGPU backend bumps the Stack Pointer by fixed-size offsets in the prolog of device functions, and restores it by the same amount in the epilog.

    Prolog: sp += frameSize
    Epilog: sp -= frameSize

If a function has dynamic stack realignment:

    Prolog: sp += frameSize + max_alignment
    Epilog: sp -= frameSize + max_alignment

These calculations are not optimal in case of dynamic stack realignment, and completely fail in case of dynamic stack readjustment. This patch uses the saved Frame Pointer to restore SP.

    Prolog: fp = sp
            sp += frameSize
    Epilog: sp = fp

In case of dynamic stack realignment, SP is restored from the saved Base Pointer.

    Prolog: fp = sp + (max_alignment - 1)
            fp = fp & (-max_alignment)
            bp = sp
            sp += frameSize + max_alignment
    Epilog: sp = bp

(Note: The presence of BP has been enforced in case of any dynamic stack realignment.)

---------

Co-authored-by: Pravin Jagtap <[email protected]>
Co-authored-by: Matt Arsenault <[email protected]>
1 parent 5343ae7 commit 012e23c

File tree

66 files changed

+1427
-1101
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

66 files changed

+1427
-1101
lines changed

llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1507,6 +1507,18 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
15071507
Register FramePtrReg = FuncInfo->getFrameOffsetReg();
15081508
bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);
15091509

1510+
if (RoundedSize != 0) {
1511+
if (TRI.hasBasePointer(MF)) {
1512+
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), StackPtrReg)
1513+
.addReg(TRI.getBaseRegister())
1514+
.setMIFlag(MachineInstr::FrameDestroy);
1515+
} else if (hasFP(MF)) {
1516+
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), StackPtrReg)
1517+
.addReg(FramePtrReg)
1518+
.setMIFlag(MachineInstr::FrameDestroy);
1519+
}
1520+
}
1521+
15101522
Register FramePtrRegScratchCopy;
15111523
Register SGPRForFPSaveRestoreCopy =
15121524
FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
@@ -1531,14 +1543,6 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
15311543
FramePtrRegScratchCopy);
15321544
}
15331545

1534-
if (RoundedSize != 0 && hasFP(MF)) {
1535-
auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
1536-
.addReg(StackPtrReg)
1537-
.addImm(-static_cast<int64_t>(RoundedSize * getScratchScaleFactor(ST)))
1538-
.setMIFlag(MachineInstr::FrameDestroy);
1539-
Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1540-
}
1541-
15421546
// FIXME: Switch to using MF.needsFrameMoves() later
15431547
const bool NeedsFrameMoves = true;
15441548
if (hasFP(MF)) {

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -521,8 +521,7 @@ Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
521521
bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
522522
// When we need stack realignment, we can't reference off of the
523523
// stack pointer, so we reserve a base pointer.
524-
const MachineFrameInfo &MFI = MF.getFrameInfo();
525-
return MFI.getNumFixedObjects() && shouldRealignStack(MF);
524+
return shouldRealignStack(MF);
526525
}
527526

528527
Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }

llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,11 @@ define ptr addrspace(1) @call_assert_align() {
2727
; CHECK-NEXT: global_store_dword v[0:1], v2, off
2828
; CHECK-NEXT: s_waitcnt vmcnt(0)
2929
; CHECK-NEXT: v_readlane_b32 s31, v40, 1
30+
; CHECK-NEXT: s_mov_b32 s32, s33
3031
; CHECK-NEXT: v_readlane_b32 s4, v40, 2
3132
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
3233
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
3334
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
34-
; CHECK-NEXT: s_addk_i32 s32, 0xfc00
3535
; CHECK-NEXT: s_mov_b32 s33, s4
3636
; CHECK-NEXT: s_waitcnt vmcnt(0)
3737
; CHECK-NEXT: s_setpc_b64 s[30:31]

llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -247,11 +247,11 @@ define void @func_caller_stack() {
247247
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
248248
; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
249249
; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
250+
; MUBUF-NEXT: s_mov_b32 s32, s33
250251
; MUBUF-NEXT: v_readlane_b32 s4, v40, 2
251252
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
252253
; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
253254
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
254-
; MUBUF-NEXT: s_addk_i32 s32, 0xfc00
255255
; MUBUF-NEXT: s_mov_b32 s33, s4
256256
; MUBUF-NEXT: s_waitcnt vmcnt(0)
257257
; MUBUF-NEXT: s_setpc_b64 s[30:31]
@@ -286,11 +286,11 @@ define void @func_caller_stack() {
286286
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
287287
; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
288288
; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
289+
; FLATSCR-NEXT: s_mov_b32 s32, s33
289290
; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2
290291
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
291292
; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
292293
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
293-
; FLATSCR-NEXT: s_add_i32 s32, s32, -16
294294
; FLATSCR-NEXT: s_mov_b32 s33, s0
295295
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
296296
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
@@ -372,11 +372,11 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) {
372372
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
373373
; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
374374
; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
375+
; MUBUF-NEXT: s_mov_b32 s32, s33
375376
; MUBUF-NEXT: v_readlane_b32 s4, v40, 2
376377
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
377378
; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
378379
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
379-
; MUBUF-NEXT: s_addk_i32 s32, 0xfc00
380380
; MUBUF-NEXT: s_mov_b32 s33, s4
381381
; MUBUF-NEXT: s_waitcnt vmcnt(0)
382382
; MUBUF-NEXT: s_setpc_b64 s[30:31]
@@ -437,11 +437,11 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) {
437437
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
438438
; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
439439
; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
440+
; FLATSCR-NEXT: s_mov_b32 s32, s33
440441
; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2
441442
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
442443
; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
443444
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
444-
; FLATSCR-NEXT: s_add_i32 s32, s32, -16
445445
; FLATSCR-NEXT: s_mov_b32 s33, s0
446446
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
447447
; FLATSCR-NEXT: s_setpc_b64 s[30:31]

llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
8585
; GFX9-NEXT: s_and_b32 s4, s4, -16
8686
; GFX9-NEXT: s_lshl_b32 s4, s4, 6
8787
; GFX9-NEXT: s_add_u32 s32, s6, s4
88-
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
88+
; GFX9-NEXT: s_mov_b32 s32, s33
8989
; GFX9-NEXT: s_mov_b32 s33, s7
9090
; GFX9-NEXT: s_waitcnt vmcnt(0)
9191
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -111,7 +111,7 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
111111
; GFX10-NEXT: s_and_b32 s4, s4, -16
112112
; GFX10-NEXT: s_lshl_b32 s4, s4, 5
113113
; GFX10-NEXT: s_add_u32 s32, s6, s4
114-
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
114+
; GFX10-NEXT: s_mov_b32 s32, s33
115115
; GFX10-NEXT: s_mov_b32 s33, s7
116116
; GFX10-NEXT: s_setpc_b64 s[30:31]
117117
;
@@ -135,9 +135,9 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
135135
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
136136
; GFX11-NEXT: s_and_b32 s0, s0, -16
137137
; GFX11-NEXT: s_lshl_b32 s0, s0, 5
138-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
138+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
139139
; GFX11-NEXT: s_add_u32 s32, s2, s0
140-
; GFX11-NEXT: s_add_i32 s32, s32, -16
140+
; GFX11-NEXT: s_mov_b32 s32, s33
141141
; GFX11-NEXT: s_mov_b32 s33, s3
142142
; GFX11-NEXT: s_setpc_b64 s[30:31]
143143
%n = load i32, ptr addrspace(4) @gv, align 4
@@ -226,7 +226,7 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
226226
; GFX9-NEXT: s_and_b32 s4, s4, -16
227227
; GFX9-NEXT: s_lshl_b32 s4, s4, 6
228228
; GFX9-NEXT: s_add_u32 s32, s6, s4
229-
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
229+
; GFX9-NEXT: s_mov_b32 s32, s33
230230
; GFX9-NEXT: s_mov_b32 s33, s7
231231
; GFX9-NEXT: s_waitcnt vmcnt(0)
232232
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -252,7 +252,7 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
252252
; GFX10-NEXT: s_and_b32 s4, s4, -16
253253
; GFX10-NEXT: s_lshl_b32 s4, s4, 5
254254
; GFX10-NEXT: s_add_u32 s32, s6, s4
255-
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
255+
; GFX10-NEXT: s_mov_b32 s32, s33
256256
; GFX10-NEXT: s_mov_b32 s33, s7
257257
; GFX10-NEXT: s_setpc_b64 s[30:31]
258258
;
@@ -276,9 +276,9 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
276276
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
277277
; GFX11-NEXT: s_and_b32 s0, s0, -16
278278
; GFX11-NEXT: s_lshl_b32 s0, s0, 5
279-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
279+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
280280
; GFX11-NEXT: s_add_u32 s32, s2, s0
281-
; GFX11-NEXT: s_add_i32 s32, s32, -16
281+
; GFX11-NEXT: s_mov_b32 s32, s33
282282
; GFX11-NEXT: s_mov_b32 s33, s3
283283
; GFX11-NEXT: s_setpc_b64 s[30:31]
284284
%n = load i32, ptr addrspace(4) @gv, align 16
@@ -355,6 +355,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
355355
; GFX9-NEXT: s_mov_b32 s6, s33
356356
; GFX9-NEXT: s_add_i32 s33, s32, 0x7c0
357357
; GFX9-NEXT: s_and_b32 s33, s33, 0xfffff800
358+
; GFX9-NEXT: s_mov_b32 s7, s34
359+
; GFX9-NEXT: s_mov_b32 s34, s32
358360
; GFX9-NEXT: s_addk_i32 s32, 0x1000
359361
; GFX9-NEXT: s_getpc_b64 s[4:5]
360362
; GFX9-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4
@@ -372,7 +374,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
372374
; GFX9-NEXT: s_and_b32 s4, s4, -16
373375
; GFX9-NEXT: s_lshl_b32 s4, s4, 6
374376
; GFX9-NEXT: s_add_u32 s32, s5, s4
375-
; GFX9-NEXT: s_addk_i32 s32, 0xf000
377+
; GFX9-NEXT: s_mov_b32 s32, s34
378+
; GFX9-NEXT: s_mov_b32 s34, s7
376379
; GFX9-NEXT: s_mov_b32 s33, s6
377380
; GFX9-NEXT: s_waitcnt vmcnt(0)
378381
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -382,7 +385,9 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
382385
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
383386
; GFX10-NEXT: s_mov_b32 s6, s33
384387
; GFX10-NEXT: s_add_i32 s33, s32, 0x3e0
388+
; GFX10-NEXT: s_mov_b32 s7, s34
385389
; GFX10-NEXT: s_and_b32 s33, s33, 0xfffffc00
390+
; GFX10-NEXT: s_mov_b32 s34, s32
386391
; GFX10-NEXT: s_addk_i32 s32, 0x800
387392
; GFX10-NEXT: s_getpc_b64 s[4:5]
388393
; GFX10-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4
@@ -400,7 +405,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
400405
; GFX10-NEXT: s_and_b32 s4, s4, -16
401406
; GFX10-NEXT: s_lshl_b32 s4, s4, 5
402407
; GFX10-NEXT: s_add_u32 s32, s5, s4
403-
; GFX10-NEXT: s_addk_i32 s32, 0xf800
408+
; GFX10-NEXT: s_mov_b32 s32, s34
409+
; GFX10-NEXT: s_mov_b32 s34, s7
404410
; GFX10-NEXT: s_mov_b32 s33, s6
405411
; GFX10-NEXT: s_setpc_b64 s[30:31]
406412
;
@@ -409,8 +415,9 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
409415
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
410416
; GFX11-NEXT: s_mov_b32 s2, s33
411417
; GFX11-NEXT: s_add_i32 s33, s32, 31
412-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
418+
; GFX11-NEXT: s_mov_b32 s3, s34
413419
; GFX11-NEXT: s_and_not1_b32 s33, s33, 31
420+
; GFX11-NEXT: s_mov_b32 s34, s32
414421
; GFX11-NEXT: s_add_i32 s32, s32, 64
415422
; GFX11-NEXT: s_getpc_b64 s[0:1]
416423
; GFX11-NEXT: s_add_u32 s0, s0, gv@gotpcrel32@lo+4
@@ -429,8 +436,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
429436
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
430437
; GFX11-NEXT: s_lshl_b32 s0, s0, 5
431438
; GFX11-NEXT: s_add_u32 s32, s1, s0
432-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
433-
; GFX11-NEXT: s_addk_i32 s32, 0xffc0
439+
; GFX11-NEXT: s_mov_b32 s32, s34
440+
; GFX11-NEXT: s_mov_b32 s34, s3
434441
; GFX11-NEXT: s_mov_b32 s33, s2
435442
; GFX11-NEXT: s_setpc_b64 s[30:31]
436443
%n = load i32, ptr addrspace(4) @gv

llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -248,11 +248,11 @@ define void @sink_null_insert_pt(ptr addrspace(4) %arg0) {
248248
; GFX9-NEXT: s_swappc_b64 s[30:31], 0
249249
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
250250
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
251+
; GFX9-NEXT: s_mov_b32 s32, s33
251252
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
252253
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
253254
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
254255
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
255-
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
256256
; GFX9-NEXT: s_mov_b32 s33, s4
257257
; GFX9-NEXT: s_waitcnt vmcnt(0)
258258
; GFX9-NEXT: s_setpc_b64 s[30:31]

llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3
180180
; GCN-NEXT: v_mov_b32_e32 v0, 0
181181
; GCN-NEXT: global_store_dword v[0:1], v0, off
182182
; GCN-NEXT: s_waitcnt vmcnt(0)
183-
; GCN-NEXT: s_addk_i32 s32, 0xfc00
183+
; GCN-NEXT: s_mov_b32 s32, s33
184184
; GCN-NEXT: s_mov_b32 s33, s7
185185
; GCN-NEXT: s_setpc_b64 s[30:31]
186186

@@ -216,7 +216,9 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
216216
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
217217
; GCN-NEXT: s_mov_b32 s7, s33
218218
; GCN-NEXT: s_add_i32 s33, s32, 0xfc0
219+
; GCN-NEXT: s_mov_b32 s8, s34
219220
; GCN-NEXT: s_and_b32 s33, s33, 0xfffff000
221+
; GCN-NEXT: s_mov_b32 s34, s32
220222
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
221223
; GCN-NEXT: s_addk_i32 s32, 0x2000
222224
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -242,7 +244,8 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
242244
; GCN-NEXT: v_mov_b32_e32 v0, 0
243245
; GCN-NEXT: global_store_dword v[0:1], v0, off
244246
; GCN-NEXT: s_waitcnt vmcnt(0)
245-
; GCN-NEXT: s_addk_i32 s32, 0xe000
247+
; GCN-NEXT: s_mov_b32 s32, s34
248+
; GCN-NEXT: s_mov_b32 s34, s8
246249
; GCN-NEXT: s_mov_b32 s33, s7
247250
; GCN-NEXT: s_setpc_b64 s[30:31]
248251
entry:

llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,11 @@ define void @parent_func_missing_inputs() #0 {
3232
; FIXEDABI-NEXT: s_swappc_b64 s[30:31], s[16:17]
3333
; FIXEDABI-NEXT: v_readlane_b32 s30, v40, 0
3434
; FIXEDABI-NEXT: v_readlane_b32 s31, v40, 1
35+
; FIXEDABI-NEXT: s_mov_b32 s32, s33
3536
; FIXEDABI-NEXT: v_readlane_b32 s4, v40, 2
3637
; FIXEDABI-NEXT: s_or_saveexec_b64 s[6:7], -1
3738
; FIXEDABI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
3839
; FIXEDABI-NEXT: s_mov_b64 exec, s[6:7]
39-
; FIXEDABI-NEXT: s_addk_i32 s32, 0xfc00
4040
; FIXEDABI-NEXT: s_mov_b32 s33, s4
4141
; FIXEDABI-NEXT: s_waitcnt vmcnt(0)
4242
; FIXEDABI-NEXT: s_setpc_b64 s[30:31]

llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -192,11 +192,11 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
192192
; CHECK-NEXT: v_readlane_b32 s36, v43, 2
193193
; CHECK-NEXT: v_readlane_b32 s35, v43, 1
194194
; CHECK-NEXT: v_readlane_b32 s34, v43, 0
195+
; CHECK-NEXT: s_mov_b32 s32, s33
195196
; CHECK-NEXT: v_readlane_b32 s4, v43, 14
196197
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
197198
; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
198199
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
199-
; CHECK-NEXT: s_addk_i32 s32, 0xf800
200200
; CHECK-NEXT: s_mov_b32 s33, s4
201201
; CHECK-NEXT: s_waitcnt vmcnt(0)
202202
; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -327,11 +327,11 @@ define double @test_powr_fast_f64(double %x, double %y) {
327327
; CHECK-NEXT: v_readlane_b32 s36, v43, 2
328328
; CHECK-NEXT: v_readlane_b32 s35, v43, 1
329329
; CHECK-NEXT: v_readlane_b32 s34, v43, 0
330+
; CHECK-NEXT: s_mov_b32 s32, s33
330331
; CHECK-NEXT: v_readlane_b32 s4, v43, 14
331332
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
332333
; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
333334
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
334-
; CHECK-NEXT: s_addk_i32 s32, 0xf800
335335
; CHECK-NEXT: s_mov_b32 s33, s4
336336
; CHECK-NEXT: s_waitcnt vmcnt(0)
337337
; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -474,11 +474,11 @@ define double @test_pown_fast_f64(double %x, i32 %y) {
474474
; CHECK-NEXT: v_readlane_b32 s36, v43, 2
475475
; CHECK-NEXT: v_readlane_b32 s35, v43, 1
476476
; CHECK-NEXT: v_readlane_b32 s34, v43, 0
477+
; CHECK-NEXT: s_mov_b32 s32, s33
477478
; CHECK-NEXT: v_readlane_b32 s4, v43, 14
478479
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
479480
; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
480481
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
481-
; CHECK-NEXT: s_addk_i32 s32, 0xf800
482482
; CHECK-NEXT: s_mov_b32 s33, s4
483483
; CHECK-NEXT: s_waitcnt vmcnt(0)
484484
; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -610,11 +610,11 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) {
610610
; CHECK-NEXT: v_readlane_b32 s36, v42, 2
611611
; CHECK-NEXT: v_readlane_b32 s35, v42, 1
612612
; CHECK-NEXT: v_readlane_b32 s34, v42, 0
613+
; CHECK-NEXT: s_mov_b32 s32, s33
613614
; CHECK-NEXT: v_readlane_b32 s4, v42, 14
614615
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
615616
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
616617
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
617-
; CHECK-NEXT: s_addk_i32 s32, 0xfc00
618618
; CHECK-NEXT: s_mov_b32 s33, s4
619619
; CHECK-NEXT: s_waitcnt vmcnt(0)
620620
; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -756,11 +756,11 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) {
756756
; CHECK-NEXT: v_readlane_b32 s36, v43, 2
757757
; CHECK-NEXT: v_readlane_b32 s35, v43, 1
758758
; CHECK-NEXT: v_readlane_b32 s34, v43, 0
759+
; CHECK-NEXT: s_mov_b32 s32, s33
759760
; CHECK-NEXT: v_readlane_b32 s4, v43, 14
760761
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
761762
; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
762763
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
763-
; CHECK-NEXT: s_addk_i32 s32, 0xf800
764764
; CHECK-NEXT: s_mov_b32 s33, s4
765765
; CHECK-NEXT: s_waitcnt vmcnt(0)
766766
; CHECK-NEXT: s_setpc_b64 s[30:31]

llvm/test/CodeGen/AMDGPU/amdgpu-spill-cfi-saved-regs.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -704,10 +704,10 @@ define void @callee_need_to_spill_fp_exec_to_memory() #2 {
704704
; WAVE64-NEXT: v_readlane_b32 s36, v39, 2
705705
; WAVE64-NEXT: v_readlane_b32 s35, v39, 1
706706
; WAVE64-NEXT: v_readlane_b32 s34, v39, 0
707+
; WAVE64-NEXT: s_mov_b32 s32, s33
707708
; WAVE64-NEXT: s_xor_saveexec_b64 s[4:5], -1
708709
; WAVE64-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:192 ; 4-byte Folded Reload
709710
; WAVE64-NEXT: s_mov_b64 exec, s[4:5]
710-
; WAVE64-NEXT: s_addk_i32 s32, 0xce00
711711
; WAVE64-NEXT: .cfi_def_cfa_register 64
712712
; WAVE64-NEXT: s_mov_b32 s33, s40
713713
; WAVE64-NEXT: s_waitcnt vmcnt(0)
@@ -1088,11 +1088,11 @@ define void @callee_need_to_spill_fp_exec_to_memory() #2 {
10881088
; WAVE32-NEXT: v_readlane_b32 s36, v39, 2
10891089
; WAVE32-NEXT: v_readlane_b32 s35, v39, 1
10901090
; WAVE32-NEXT: v_readlane_b32 s34, v39, 0
1091+
; WAVE32-NEXT: s_mov_b32 s32, s33
10911092
; WAVE32-NEXT: s_xor_saveexec_b32 s4, -1
10921093
; WAVE32-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:192 ; 4-byte Folded Reload
10931094
; WAVE32-NEXT: s_waitcnt_depctr 0xffe3
10941095
; WAVE32-NEXT: s_mov_b32 exec_lo, s4
1095-
; WAVE32-NEXT: s_addk_i32 s32, 0xe680
10961096
; WAVE32-NEXT: .cfi_def_cfa_register 64
10971097
; WAVE32-NEXT: s_mov_b32 s33, s40
10981098
; WAVE32-NEXT: s_waitcnt vmcnt(0)
@@ -1999,7 +1999,7 @@ define void @need_to_spill_pc_to_mem() #3 {
19991999
; WAVE64-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload
20002000
; WAVE64-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:440 ; 4-byte Folded Reload
20012001
; WAVE64-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload
2002-
; WAVE64-NEXT: s_addk_i32 s32, 0x8800
2002+
; WAVE64-NEXT: s_mov_b32 s32, s33
20032003
; WAVE64-NEXT: .cfi_def_cfa_register 64
20042004
; WAVE64-NEXT: s_mov_b32 s33, s6
20052005
; WAVE64-NEXT: s_waitcnt vmcnt(0)
@@ -2531,7 +2531,7 @@ define void @need_to_spill_pc_to_mem() #3 {
25312531
; WAVE32-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:436
25322532
; WAVE32-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:440
25332533
; WAVE32-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:444
2534-
; WAVE32-NEXT: s_addk_i32 s32, 0xc600
2534+
; WAVE32-NEXT: s_mov_b32 s32, s33
25352535
; WAVE32-NEXT: .cfi_def_cfa_register 64
25362536
; WAVE32-NEXT: s_waitcnt_depctr 0xffe3
25372537
; WAVE32-NEXT: s_mov_b32 s33, s6

0 commit comments

Comments
 (0)