Skip to content

Commit 01ac917

Browse files
authored
Merge pull request llvm#308 from AMD-Lightning-Internal/amd/dev/rlieberm/CP-apply-124007
[AMDGPU] Restore SP from saved-FP or saved-BP (llvm#124007)
2 parents 2f1d47b + 23dd0e8 commit 01ac917

File tree

63 files changed

+1329
-1046
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

63 files changed

+1329
-1046
lines changed

llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

Lines changed: 10 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -1512,6 +1512,16 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
15121512
const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
15131513
Register FramePtrReg = FuncInfo->getFrameOffsetReg();
15141514
bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);
1515+
if (RoundedSize != 0) {
1516+
if (TRI.hasBasePointer(MF))
1517+
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), StackPtrReg)
1518+
.addReg(TRI.getBaseRegister())
1519+
.setMIFlag(MachineInstr::FrameDestroy);
1520+
else if (hasFP(MF))
1521+
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), StackPtrReg)
1522+
.addReg(FramePtrReg)
1523+
.setMIFlag(MachineInstr::FrameDestroy);
1524+
}
15151525

15161526
Register FramePtrRegScratchCopy;
15171527
Register SGPRForFPSaveRestoreCopy =
@@ -1537,14 +1547,6 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
15371547
FramePtrRegScratchCopy);
15381548
}
15391549

1540-
if (RoundedSize != 0 && hasFP(MF)) {
1541-
auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
1542-
.addReg(StackPtrReg)
1543-
.addImm(-static_cast<int64_t>(RoundedSize * getScratchScaleFactor(ST)))
1544-
.setMIFlag(MachineInstr::FrameDestroy);
1545-
Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1546-
}
1547-
15481550
// FIXME: Switch to using MF.needsFrameMoves() later
15491551
const bool NeedsFrameMoves = true;
15501552
if (hasFP(MF)) {

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -530,8 +530,7 @@ Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
530530
bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
531531
// When we need stack realignment, we can't reference off of the
532532
// stack pointer, so we reserve a base pointer.
533-
const MachineFrameInfo &MFI = MF.getFrameInfo();
534-
return MFI.getNumFixedObjects() && shouldRealignStack(MF);
533+
return shouldRealignStack(MF);
535534
}
536535

537536
Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }

llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -27,11 +27,11 @@ define ptr addrspace(1) @call_assert_align() {
2727
; CHECK-NEXT: global_store_dword v[0:1], v2, off
2828
; CHECK-NEXT: s_waitcnt vmcnt(0)
2929
; CHECK-NEXT: v_readlane_b32 s31, v40, 1
30+
; CHECK-NEXT: s_mov_b32 s32, s33
3031
; CHECK-NEXT: v_readlane_b32 s4, v40, 2
3132
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
3233
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
3334
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
34-
; CHECK-NEXT: s_addk_i32 s32, 0xfc00
3535
; CHECK-NEXT: s_mov_b32 s33, s4
3636
; CHECK-NEXT: s_waitcnt vmcnt(0)
3737
; CHECK-NEXT: s_setpc_b64 s[30:31]

llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -247,11 +247,11 @@ define void @func_caller_stack() {
247247
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
248248
; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
249249
; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
250+
; MUBUF-NEXT: s_mov_b32 s32, s33
250251
; MUBUF-NEXT: v_readlane_b32 s4, v40, 2
251252
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
252253
; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
253254
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
254-
; MUBUF-NEXT: s_addk_i32 s32, 0xfc00
255255
; MUBUF-NEXT: s_mov_b32 s33, s4
256256
; MUBUF-NEXT: s_waitcnt vmcnt(0)
257257
; MUBUF-NEXT: s_setpc_b64 s[30:31]
@@ -286,11 +286,11 @@ define void @func_caller_stack() {
286286
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
287287
; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
288288
; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
289+
; FLATSCR-NEXT: s_mov_b32 s32, s33
289290
; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2
290291
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
291292
; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
292293
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
293-
; FLATSCR-NEXT: s_add_i32 s32, s32, -16
294294
; FLATSCR-NEXT: s_mov_b32 s33, s0
295295
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
296296
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
@@ -372,11 +372,11 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) {
372372
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
373373
; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
374374
; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
375+
; MUBUF-NEXT: s_mov_b32 s32, s33
375376
; MUBUF-NEXT: v_readlane_b32 s4, v40, 2
376377
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
377378
; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
378379
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
379-
; MUBUF-NEXT: s_addk_i32 s32, 0xfc00
380380
; MUBUF-NEXT: s_mov_b32 s33, s4
381381
; MUBUF-NEXT: s_waitcnt vmcnt(0)
382382
; MUBUF-NEXT: s_setpc_b64 s[30:31]
@@ -437,11 +437,11 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) {
437437
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
438438
; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
439439
; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
440+
; FLATSCR-NEXT: s_mov_b32 s32, s33
440441
; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2
441442
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
442443
; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
443444
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
444-
; FLATSCR-NEXT: s_add_i32 s32, s32, -16
445445
; FLATSCR-NEXT: s_mov_b32 s33, s0
446446
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
447447
; FLATSCR-NEXT: s_setpc_b64 s[30:31]

llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll

Lines changed: 20 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -85,7 +85,7 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
8585
; GFX9-NEXT: s_and_b32 s4, s4, -16
8686
; GFX9-NEXT: s_lshl_b32 s4, s4, 6
8787
; GFX9-NEXT: s_add_u32 s32, s6, s4
88-
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
88+
; GFX9-NEXT: s_mov_b32 s32, s33
8989
; GFX9-NEXT: s_mov_b32 s33, s7
9090
; GFX9-NEXT: s_waitcnt vmcnt(0)
9191
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -111,7 +111,7 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
111111
; GFX10-NEXT: s_and_b32 s4, s4, -16
112112
; GFX10-NEXT: s_lshl_b32 s4, s4, 5
113113
; GFX10-NEXT: s_add_u32 s32, s6, s4
114-
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
114+
; GFX10-NEXT: s_mov_b32 s32, s33
115115
; GFX10-NEXT: s_mov_b32 s33, s7
116116
; GFX10-NEXT: s_setpc_b64 s[30:31]
117117
;
@@ -135,9 +135,9 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
135135
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
136136
; GFX11-NEXT: s_and_b32 s0, s0, -16
137137
; GFX11-NEXT: s_lshl_b32 s0, s0, 5
138-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
138+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
139139
; GFX11-NEXT: s_add_u32 s32, s2, s0
140-
; GFX11-NEXT: s_add_i32 s32, s32, -16
140+
; GFX11-NEXT: s_mov_b32 s32, s33
141141
; GFX11-NEXT: s_mov_b32 s33, s3
142142
; GFX11-NEXT: s_setpc_b64 s[30:31]
143143
%n = load i32, ptr addrspace(4) @gv, align 4
@@ -226,7 +226,7 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
226226
; GFX9-NEXT: s_and_b32 s4, s4, -16
227227
; GFX9-NEXT: s_lshl_b32 s4, s4, 6
228228
; GFX9-NEXT: s_add_u32 s32, s6, s4
229-
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
229+
; GFX9-NEXT: s_mov_b32 s32, s33
230230
; GFX9-NEXT: s_mov_b32 s33, s7
231231
; GFX9-NEXT: s_waitcnt vmcnt(0)
232232
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -252,7 +252,7 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
252252
; GFX10-NEXT: s_and_b32 s4, s4, -16
253253
; GFX10-NEXT: s_lshl_b32 s4, s4, 5
254254
; GFX10-NEXT: s_add_u32 s32, s6, s4
255-
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
255+
; GFX10-NEXT: s_mov_b32 s32, s33
256256
; GFX10-NEXT: s_mov_b32 s33, s7
257257
; GFX10-NEXT: s_setpc_b64 s[30:31]
258258
;
@@ -276,9 +276,9 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
276276
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
277277
; GFX11-NEXT: s_and_b32 s0, s0, -16
278278
; GFX11-NEXT: s_lshl_b32 s0, s0, 5
279-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
279+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
280280
; GFX11-NEXT: s_add_u32 s32, s2, s0
281-
; GFX11-NEXT: s_add_i32 s32, s32, -16
281+
; GFX11-NEXT: s_mov_b32 s32, s33
282282
; GFX11-NEXT: s_mov_b32 s33, s3
283283
; GFX11-NEXT: s_setpc_b64 s[30:31]
284284
%n = load i32, ptr addrspace(4) @gv, align 16
@@ -355,6 +355,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
355355
; GFX9-NEXT: s_mov_b32 s6, s33
356356
; GFX9-NEXT: s_add_i32 s33, s32, 0x7c0
357357
; GFX9-NEXT: s_and_b32 s33, s33, 0xfffff800
358+
; GFX9-NEXT: s_mov_b32 s7, s34
359+
; GFX9-NEXT: s_mov_b32 s34, s32
358360
; GFX9-NEXT: s_addk_i32 s32, 0x1000
359361
; GFX9-NEXT: s_getpc_b64 s[4:5]
360362
; GFX9-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4
@@ -372,7 +374,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
372374
; GFX9-NEXT: s_and_b32 s4, s4, -16
373375
; GFX9-NEXT: s_lshl_b32 s4, s4, 6
374376
; GFX9-NEXT: s_add_u32 s32, s5, s4
375-
; GFX9-NEXT: s_addk_i32 s32, 0xf000
377+
; GFX9-NEXT: s_mov_b32 s32, s34
378+
; GFX9-NEXT: s_mov_b32 s34, s7
376379
; GFX9-NEXT: s_mov_b32 s33, s6
377380
; GFX9-NEXT: s_waitcnt vmcnt(0)
378381
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -382,7 +385,9 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
382385
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
383386
; GFX10-NEXT: s_mov_b32 s6, s33
384387
; GFX10-NEXT: s_add_i32 s33, s32, 0x3e0
388+
; GFX10-NEXT: s_mov_b32 s7, s34
385389
; GFX10-NEXT: s_and_b32 s33, s33, 0xfffffc00
390+
; GFX10-NEXT: s_mov_b32 s34, s32
386391
; GFX10-NEXT: s_addk_i32 s32, 0x800
387392
; GFX10-NEXT: s_getpc_b64 s[4:5]
388393
; GFX10-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4
@@ -400,7 +405,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
400405
; GFX10-NEXT: s_and_b32 s4, s4, -16
401406
; GFX10-NEXT: s_lshl_b32 s4, s4, 5
402407
; GFX10-NEXT: s_add_u32 s32, s5, s4
403-
; GFX10-NEXT: s_addk_i32 s32, 0xf800
408+
; GFX10-NEXT: s_mov_b32 s32, s34
409+
; GFX10-NEXT: s_mov_b32 s34, s7
404410
; GFX10-NEXT: s_mov_b32 s33, s6
405411
; GFX10-NEXT: s_setpc_b64 s[30:31]
406412
;
@@ -409,8 +415,9 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
409415
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
410416
; GFX11-NEXT: s_mov_b32 s2, s33
411417
; GFX11-NEXT: s_add_i32 s33, s32, 31
412-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
418+
; GFX11-NEXT: s_mov_b32 s3, s34
413419
; GFX11-NEXT: s_and_not1_b32 s33, s33, 31
420+
; GFX11-NEXT: s_mov_b32 s34, s32
414421
; GFX11-NEXT: s_add_i32 s32, s32, 64
415422
; GFX11-NEXT: s_getpc_b64 s[0:1]
416423
; GFX11-NEXT: s_add_u32 s0, s0, gv@gotpcrel32@lo+4
@@ -429,8 +436,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
429436
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
430437
; GFX11-NEXT: s_lshl_b32 s0, s0, 5
431438
; GFX11-NEXT: s_add_u32 s32, s1, s0
432-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
433-
; GFX11-NEXT: s_addk_i32 s32, 0xffc0
439+
; GFX11-NEXT: s_mov_b32 s32, s34
440+
; GFX11-NEXT: s_mov_b32 s34, s3
434441
; GFX11-NEXT: s_mov_b32 s33, s2
435442
; GFX11-NEXT: s_setpc_b64 s[30:31]
436443
%n = load i32, ptr addrspace(4) @gv

llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -248,11 +248,11 @@ define void @sink_null_insert_pt(ptr addrspace(4) %arg0) {
248248
; GFX9-NEXT: s_swappc_b64 s[30:31], 0
249249
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
250250
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
251+
; GFX9-NEXT: s_mov_b32 s32, s33
251252
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
252253
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
253254
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
254255
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
255-
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
256256
; GFX9-NEXT: s_mov_b32 s33, s4
257257
; GFX9-NEXT: s_waitcnt vmcnt(0)
258258
; GFX9-NEXT: s_setpc_b64 s[30:31]

llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -180,7 +180,7 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3
180180
; GCN-NEXT: v_mov_b32_e32 v0, 0
181181
; GCN-NEXT: global_store_dword v[0:1], v0, off
182182
; GCN-NEXT: s_waitcnt vmcnt(0)
183-
; GCN-NEXT: s_addk_i32 s32, 0xfc00
183+
; GCN-NEXT: s_mov_b32 s32, s33
184184
; GCN-NEXT: s_mov_b32 s33, s7
185185
; GCN-NEXT: s_setpc_b64 s[30:31]
186186

@@ -216,7 +216,9 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
216216
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
217217
; GCN-NEXT: s_mov_b32 s7, s33
218218
; GCN-NEXT: s_add_i32 s33, s32, 0xfc0
219+
; GCN-NEXT: s_mov_b32 s8, s34
219220
; GCN-NEXT: s_and_b32 s33, s33, 0xfffff000
221+
; GCN-NEXT: s_mov_b32 s34, s32
220222
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
221223
; GCN-NEXT: s_addk_i32 s32, 0x2000
222224
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -242,7 +244,8 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
242244
; GCN-NEXT: v_mov_b32_e32 v0, 0
243245
; GCN-NEXT: global_store_dword v[0:1], v0, off
244246
; GCN-NEXT: s_waitcnt vmcnt(0)
245-
; GCN-NEXT: s_addk_i32 s32, 0xe000
247+
; GCN-NEXT: s_mov_b32 s32, s34
248+
; GCN-NEXT: s_mov_b32 s34, s8
246249
; GCN-NEXT: s_mov_b32 s33, s7
247250
; GCN-NEXT: s_setpc_b64 s[30:31]
248251
entry:

llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -32,11 +32,11 @@ define void @parent_func_missing_inputs() #0 {
3232
; FIXEDABI-NEXT: s_swappc_b64 s[30:31], s[16:17]
3333
; FIXEDABI-NEXT: v_readlane_b32 s30, v40, 0
3434
; FIXEDABI-NEXT: v_readlane_b32 s31, v40, 1
35+
; FIXEDABI-NEXT: s_mov_b32 s32, s33
3536
; FIXEDABI-NEXT: v_readlane_b32 s4, v40, 2
3637
; FIXEDABI-NEXT: s_or_saveexec_b64 s[6:7], -1
3738
; FIXEDABI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
3839
; FIXEDABI-NEXT: s_mov_b64 exec, s[6:7]
39-
; FIXEDABI-NEXT: s_addk_i32 s32, 0xfc00
4040
; FIXEDABI-NEXT: s_mov_b32 s33, s4
4141
; FIXEDABI-NEXT: s_waitcnt vmcnt(0)
4242
; FIXEDABI-NEXT: s_setpc_b64 s[30:31]

llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -193,11 +193,11 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
193193
; CHECK-NEXT: v_readlane_b32 s36, v43, 2
194194
; CHECK-NEXT: v_readlane_b32 s35, v43, 1
195195
; CHECK-NEXT: v_readlane_b32 s34, v43, 0
196+
; CHECK-NEXT: s_mov_b32 s32, s33
196197
; CHECK-NEXT: v_readlane_b32 s4, v43, 14
197198
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
198199
; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
199200
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
200-
; CHECK-NEXT: s_addk_i32 s32, 0xf800
201201
; CHECK-NEXT: s_mov_b32 s33, s4
202202
; CHECK-NEXT: s_waitcnt vmcnt(0)
203203
; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -329,11 +329,11 @@ define double @test_powr_fast_f64(double %x, double %y) {
329329
; CHECK-NEXT: v_readlane_b32 s36, v43, 2
330330
; CHECK-NEXT: v_readlane_b32 s35, v43, 1
331331
; CHECK-NEXT: v_readlane_b32 s34, v43, 0
332+
; CHECK-NEXT: s_mov_b32 s32, s33
332333
; CHECK-NEXT: v_readlane_b32 s4, v43, 14
333334
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
334335
; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
335336
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
336-
; CHECK-NEXT: s_addk_i32 s32, 0xf800
337337
; CHECK-NEXT: s_mov_b32 s33, s4
338338
; CHECK-NEXT: s_waitcnt vmcnt(0)
339339
; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -477,11 +477,11 @@ define double @test_pown_fast_f64(double %x, i32 %y) {
477477
; CHECK-NEXT: v_readlane_b32 s36, v43, 2
478478
; CHECK-NEXT: v_readlane_b32 s35, v43, 1
479479
; CHECK-NEXT: v_readlane_b32 s34, v43, 0
480+
; CHECK-NEXT: s_mov_b32 s32, s33
480481
; CHECK-NEXT: v_readlane_b32 s4, v43, 14
481482
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
482483
; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
483484
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
484-
; CHECK-NEXT: s_addk_i32 s32, 0xf800
485485
; CHECK-NEXT: s_mov_b32 s33, s4
486486
; CHECK-NEXT: s_waitcnt vmcnt(0)
487487
; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -614,11 +614,11 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) {
614614
; CHECK-NEXT: v_readlane_b32 s36, v42, 2
615615
; CHECK-NEXT: v_readlane_b32 s35, v42, 1
616616
; CHECK-NEXT: v_readlane_b32 s34, v42, 0
617+
; CHECK-NEXT: s_mov_b32 s32, s33
617618
; CHECK-NEXT: v_readlane_b32 s4, v42, 14
618619
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
619620
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
620621
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
621-
; CHECK-NEXT: s_addk_i32 s32, 0xfc00
622622
; CHECK-NEXT: s_mov_b32 s33, s4
623623
; CHECK-NEXT: s_waitcnt vmcnt(0)
624624
; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -761,11 +761,11 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) {
761761
; CHECK-NEXT: v_readlane_b32 s36, v43, 2
762762
; CHECK-NEXT: v_readlane_b32 s35, v43, 1
763763
; CHECK-NEXT: v_readlane_b32 s34, v43, 0
764+
; CHECK-NEXT: s_mov_b32 s32, s33
764765
; CHECK-NEXT: v_readlane_b32 s4, v43, 14
765766
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
766767
; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
767768
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
768-
; CHECK-NEXT: s_addk_i32 s32, 0xf800
769769
; CHECK-NEXT: s_mov_b32 s33, s4
770770
; CHECK-NEXT: s_waitcnt vmcnt(0)
771771
; CHECK-NEXT: s_setpc_b64 s[30:31]

0 commit comments

Comments
 (0)