Skip to content

Commit 11b0401

Browse files
easyonaadit, pravinjagtap, arsenm
authored
[AMDGPU] Restore SP from saved-FP or saved-BP (#124007)
Currently, the AMDGPU backend bumps the Stack Pointer by fixed-size offsets in the prolog of device functions, and restores it by the same amount in the epilog.

    Prolog: sp += frameSize
    Epilog: sp -= frameSize

If a function has dynamic stack realignment:

    Prolog: sp += frameSize + max_alignment
    Epilog: sp -= frameSize + max_alignment

These calculations are not optimal in case of dynamic stack realignment, and completely fail in case of dynamic stack readjustment.

This patch uses the saved Frame Pointer to restore SP.

    Prolog: fp = sp
            sp += frameSize
    Epilog: sp = fp

In case of dynamic stack realignment, SP is restored from the saved Base Pointer.

    Prolog: fp = sp + (max_alignment - 1)
            fp = fp & (-max_alignment)
            bp = sp
            sp += frameSize + max_alignment
    Epilog: sp = bp

(Note: The presence of BP has been enforced in case of any dynamic stack realignment.)

---------

Co-authored-by: Pravin Jagtap <[email protected]>
Co-authored-by: Matt Arsenault <[email protected]>
1 parent 02c6002 commit 11b0401

File tree

61 files changed

+1305
-1064
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

61 files changed

+1305
-1064
lines changed

llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1256,6 +1256,18 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
12561256
Register FramePtrReg = FuncInfo->getFrameOffsetReg();
12571257
bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);
12581258

1259+
if (RoundedSize != 0) {
1260+
if (TRI.hasBasePointer(MF)) {
1261+
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), StackPtrReg)
1262+
.addReg(TRI.getBaseRegister())
1263+
.setMIFlag(MachineInstr::FrameDestroy);
1264+
} else if (hasFP(MF)) {
1265+
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), StackPtrReg)
1266+
.addReg(FramePtrReg)
1267+
.setMIFlag(MachineInstr::FrameDestroy);
1268+
}
1269+
}
1270+
12591271
Register FramePtrRegScratchCopy;
12601272
Register SGPRForFPSaveRestoreCopy =
12611273
FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
@@ -1280,14 +1292,6 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
12801292
FramePtrRegScratchCopy);
12811293
}
12821294

1283-
if (RoundedSize != 0 && hasFP(MF)) {
1284-
auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
1285-
.addReg(StackPtrReg)
1286-
.addImm(-static_cast<int64_t>(RoundedSize * getScratchScaleFactor(ST)))
1287-
.setMIFlag(MachineInstr::FrameDestroy);
1288-
Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1289-
}
1290-
12911295
if (FPSaved) {
12921296
// Insert the copy to restore FP.
12931297
Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -525,8 +525,7 @@ Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
525525
bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
526526
// When we need stack realignment, we can't reference off of the
527527
// stack pointer, so we reserve a base pointer.
528-
const MachineFrameInfo &MFI = MF.getFrameInfo();
529-
return MFI.getNumFixedObjects() && shouldRealignStack(MF);
528+
return shouldRealignStack(MF);
530529
}
531530

532531
Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }

llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,11 @@ define ptr addrspace(1) @call_assert_align() {
2727
; CHECK-NEXT: s_waitcnt vmcnt(0)
2828
; CHECK-NEXT: v_readlane_b32 s31, v40, 1
2929
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
30+
; CHECK-NEXT: s_mov_b32 s32, s33
3031
; CHECK-NEXT: v_readlane_b32 s4, v40, 2
3132
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
3233
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
3334
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
34-
; CHECK-NEXT: s_addk_i32 s32, 0xfc00
3535
; CHECK-NEXT: s_mov_b32 s33, s4
3636
; CHECK-NEXT: s_waitcnt vmcnt(0)
3737
; CHECK-NEXT: s_setpc_b64 s[30:31]

llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -247,11 +247,11 @@ define void @func_caller_stack() {
247247
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
248248
; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
249249
; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
250+
; MUBUF-NEXT: s_mov_b32 s32, s33
250251
; MUBUF-NEXT: v_readlane_b32 s4, v40, 2
251252
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
252253
; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
253254
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
254-
; MUBUF-NEXT: s_addk_i32 s32, 0xfc00
255255
; MUBUF-NEXT: s_mov_b32 s33, s4
256256
; MUBUF-NEXT: s_waitcnt vmcnt(0)
257257
; MUBUF-NEXT: s_setpc_b64 s[30:31]
@@ -286,11 +286,11 @@ define void @func_caller_stack() {
286286
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
287287
; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
288288
; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
289+
; FLATSCR-NEXT: s_mov_b32 s32, s33
289290
; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2
290291
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
291292
; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
292293
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
293-
; FLATSCR-NEXT: s_add_i32 s32, s32, -16
294294
; FLATSCR-NEXT: s_mov_b32 s33, s0
295295
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
296296
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
@@ -372,11 +372,11 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) {
372372
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
373373
; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
374374
; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
375+
; MUBUF-NEXT: s_mov_b32 s32, s33
375376
; MUBUF-NEXT: v_readlane_b32 s4, v40, 2
376377
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
377378
; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
378379
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
379-
; MUBUF-NEXT: s_addk_i32 s32, 0xfc00
380380
; MUBUF-NEXT: s_mov_b32 s33, s4
381381
; MUBUF-NEXT: s_waitcnt vmcnt(0)
382382
; MUBUF-NEXT: s_setpc_b64 s[30:31]
@@ -437,11 +437,11 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) {
437437
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
438438
; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
439439
; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
440+
; FLATSCR-NEXT: s_mov_b32 s32, s33
440441
; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2
441442
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
442443
; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
443444
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
444-
; FLATSCR-NEXT: s_add_i32 s32, s32, -16
445445
; FLATSCR-NEXT: s_mov_b32 s33, s0
446446
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
447447
; FLATSCR-NEXT: s_setpc_b64 s[30:31]

llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll

Lines changed: 28 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -80,13 +80,13 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
8080
; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
8181
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
8282
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0
83-
; GFX9-NEXT: s_mov_b32 s33, s7
8483
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
8584
; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15
8685
; GFX9-NEXT: s_and_b32 s4, s4, -16
8786
; GFX9-NEXT: s_lshl_b32 s4, s4, 6
8887
; GFX9-NEXT: s_add_u32 s32, s6, s4
89-
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
88+
; GFX9-NEXT: s_mov_b32 s32, s33
89+
; GFX9-NEXT: s_mov_b32 s33, s7
9090
; GFX9-NEXT: s_waitcnt vmcnt(0)
9191
; GFX9-NEXT: s_setpc_b64 s[30:31]
9292
;
@@ -103,7 +103,6 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
103103
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
104104
; GFX10-NEXT: v_mov_b32_e32 v0, 0
105105
; GFX10-NEXT: v_mov_b32_e32 v1, s6
106-
; GFX10-NEXT: s_mov_b32 s33, s7
107106
; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
108107
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
109108
; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0
@@ -112,7 +111,8 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
112111
; GFX10-NEXT: s_and_b32 s4, s4, -16
113112
; GFX10-NEXT: s_lshl_b32 s4, s4, 5
114113
; GFX10-NEXT: s_add_u32 s32, s6, s4
115-
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
114+
; GFX10-NEXT: s_mov_b32 s32, s33
115+
; GFX10-NEXT: s_mov_b32 s33, s7
116116
; GFX10-NEXT: s_setpc_b64 s[30:31]
117117
;
118118
; GFX11-LABEL: func_dynamic_stackalloc_sgpr_align4:
@@ -127,7 +127,6 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
127127
; GFX11-NEXT: v_mov_b32_e32 v0, 0
128128
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
129129
; GFX11-NEXT: s_mov_b32 s2, s32
130-
; GFX11-NEXT: s_mov_b32 s33, s3
131130
; GFX11-NEXT: scratch_store_b32 off, v0, s2
132131
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
133132
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
@@ -136,9 +135,10 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
136135
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
137136
; GFX11-NEXT: s_and_b32 s0, s0, -16
138137
; GFX11-NEXT: s_lshl_b32 s0, s0, 5
139-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
138+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
140139
; GFX11-NEXT: s_add_u32 s32, s2, s0
141-
; GFX11-NEXT: s_add_i32 s32, s32, -16
140+
; GFX11-NEXT: s_mov_b32 s32, s33
141+
; GFX11-NEXT: s_mov_b32 s33, s3
142142
; GFX11-NEXT: s_setpc_b64 s[30:31]
143143
%n = load i32, ptr addrspace(4) @gv, align 4
144144
%alloca = alloca i32, i32 %n, addrspace(5)
@@ -221,13 +221,13 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
221221
; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
222222
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
223223
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0
224-
; GFX9-NEXT: s_mov_b32 s33, s7
225224
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
226225
; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15
227226
; GFX9-NEXT: s_and_b32 s4, s4, -16
228227
; GFX9-NEXT: s_lshl_b32 s4, s4, 6
229228
; GFX9-NEXT: s_add_u32 s32, s6, s4
230-
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
229+
; GFX9-NEXT: s_mov_b32 s32, s33
230+
; GFX9-NEXT: s_mov_b32 s33, s7
231231
; GFX9-NEXT: s_waitcnt vmcnt(0)
232232
; GFX9-NEXT: s_setpc_b64 s[30:31]
233233
;
@@ -244,7 +244,6 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
244244
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
245245
; GFX10-NEXT: v_mov_b32_e32 v0, 0
246246
; GFX10-NEXT: v_mov_b32_e32 v1, s6
247-
; GFX10-NEXT: s_mov_b32 s33, s7
248247
; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
249248
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
250249
; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0
@@ -253,7 +252,8 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
253252
; GFX10-NEXT: s_and_b32 s4, s4, -16
254253
; GFX10-NEXT: s_lshl_b32 s4, s4, 5
255254
; GFX10-NEXT: s_add_u32 s32, s6, s4
256-
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
255+
; GFX10-NEXT: s_mov_b32 s32, s33
256+
; GFX10-NEXT: s_mov_b32 s33, s7
257257
; GFX10-NEXT: s_setpc_b64 s[30:31]
258258
;
259259
; GFX11-LABEL: func_dynamic_stackalloc_sgpr_align16:
@@ -268,7 +268,6 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
268268
; GFX11-NEXT: v_mov_b32_e32 v0, 0
269269
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
270270
; GFX11-NEXT: s_mov_b32 s2, s32
271-
; GFX11-NEXT: s_mov_b32 s33, s3
272271
; GFX11-NEXT: scratch_store_b32 off, v0, s2
273272
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
274273
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
@@ -277,9 +276,10 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
277276
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
278277
; GFX11-NEXT: s_and_b32 s0, s0, -16
279278
; GFX11-NEXT: s_lshl_b32 s0, s0, 5
280-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
279+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
281280
; GFX11-NEXT: s_add_u32 s32, s2, s0
282-
; GFX11-NEXT: s_add_i32 s32, s32, -16
281+
; GFX11-NEXT: s_mov_b32 s32, s33
282+
; GFX11-NEXT: s_mov_b32 s33, s3
283283
; GFX11-NEXT: s_setpc_b64 s[30:31]
284284
%n = load i32, ptr addrspace(4) @gv, align 16
285285
%alloca = alloca i32, i32 %n, addrspace(5)
@@ -355,6 +355,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
355355
; GFX9-NEXT: s_mov_b32 s6, s33
356356
; GFX9-NEXT: s_add_i32 s33, s32, 0x7c0
357357
; GFX9-NEXT: s_and_b32 s33, s33, 0xfffff800
358+
; GFX9-NEXT: s_mov_b32 s7, s34
359+
; GFX9-NEXT: s_mov_b32 s34, s32
358360
; GFX9-NEXT: s_addk_i32 s32, 0x1000
359361
; GFX9-NEXT: s_getpc_b64 s[4:5]
360362
; GFX9-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4
@@ -373,7 +375,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
373375
; GFX9-NEXT: s_and_b32 s4, s4, -16
374376
; GFX9-NEXT: s_lshl_b32 s4, s4, 6
375377
; GFX9-NEXT: s_add_u32 s32, s5, s4
376-
; GFX9-NEXT: s_addk_i32 s32, 0xf000
378+
; GFX9-NEXT: s_mov_b32 s32, s34
379+
; GFX9-NEXT: s_mov_b32 s34, s7
377380
; GFX9-NEXT: s_waitcnt vmcnt(0)
378381
; GFX9-NEXT: s_setpc_b64 s[30:31]
379382
;
@@ -382,8 +385,10 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
382385
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
383386
; GFX10-NEXT: s_mov_b32 s6, s33
384387
; GFX10-NEXT: s_add_i32 s33, s32, 0x3e0
385-
; GFX10-NEXT: s_addk_i32 s32, 0x800
388+
; GFX10-NEXT: s_mov_b32 s7, s34
386389
; GFX10-NEXT: s_and_b32 s33, s33, 0xfffffc00
390+
; GFX10-NEXT: s_mov_b32 s34, s32
391+
; GFX10-NEXT: s_addk_i32 s32, 0x800
387392
; GFX10-NEXT: s_getpc_b64 s[4:5]
388393
; GFX10-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4
389394
; GFX10-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
@@ -401,16 +406,19 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
401406
; GFX10-NEXT: s_and_b32 s4, s4, -16
402407
; GFX10-NEXT: s_lshl_b32 s4, s4, 5
403408
; GFX10-NEXT: s_add_u32 s32, s5, s4
404-
; GFX10-NEXT: s_addk_i32 s32, 0xf800
409+
; GFX10-NEXT: s_mov_b32 s32, s34
410+
; GFX10-NEXT: s_mov_b32 s34, s7
405411
; GFX10-NEXT: s_setpc_b64 s[30:31]
406412
;
407413
; GFX11-LABEL: func_dynamic_stackalloc_sgpr_align32:
408414
; GFX11: ; %bb.0:
409415
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
410416
; GFX11-NEXT: s_mov_b32 s2, s33
411417
; GFX11-NEXT: s_add_i32 s33, s32, 31
412-
; GFX11-NEXT: s_add_i32 s32, s32, 64
418+
; GFX11-NEXT: s_mov_b32 s3, s34
413419
; GFX11-NEXT: s_and_not1_b32 s33, s33, 31
420+
; GFX11-NEXT: s_mov_b32 s34, s32
421+
; GFX11-NEXT: s_add_i32 s32, s32, 64
414422
; GFX11-NEXT: s_getpc_b64 s[0:1]
415423
; GFX11-NEXT: s_add_u32 s0, s0, gv@gotpcrel32@lo+4
416424
; GFX11-NEXT: s_addc_u32 s1, s1, gv@gotpcrel32@hi+12
@@ -429,8 +437,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
429437
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
430438
; GFX11-NEXT: s_lshl_b32 s0, s0, 5
431439
; GFX11-NEXT: s_add_u32 s32, s1, s0
432-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
433-
; GFX11-NEXT: s_addk_i32 s32, 0xffc0
440+
; GFX11-NEXT: s_mov_b32 s32, s34
441+
; GFX11-NEXT: s_mov_b32 s34, s3
434442
; GFX11-NEXT: s_setpc_b64 s[30:31]
435443
%n = load i32, ptr addrspace(4) @gv
436444
%alloca = alloca i32, i32 %n, align 32, addrspace(5)

llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -248,11 +248,11 @@ define void @sink_null_insert_pt(ptr addrspace(4) %arg0) {
248248
; GFX9-NEXT: s_swappc_b64 s[30:31], 0
249249
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
250250
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
251+
; GFX9-NEXT: s_mov_b32 s32, s33
251252
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
252253
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
253254
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
254255
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
255-
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
256256
; GFX9-NEXT: s_mov_b32 s33, s4
257257
; GFX9-NEXT: s_waitcnt vmcnt(0)
258258
; GFX9-NEXT: s_setpc_b64 s[30:31]

llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3
180180
; GCN-NEXT: v_mov_b32_e32 v0, 0
181181
; GCN-NEXT: global_store_dword v[0:1], v0, off
182182
; GCN-NEXT: s_waitcnt vmcnt(0)
183-
; GCN-NEXT: s_addk_i32 s32, 0xfc00
183+
; GCN-NEXT: s_mov_b32 s32, s33
184184
; GCN-NEXT: s_mov_b32 s33, s7
185185
; GCN-NEXT: s_setpc_b64 s[30:31]
186186

@@ -216,8 +216,10 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
216216
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
217217
; GCN-NEXT: s_mov_b32 s7, s33
218218
; GCN-NEXT: s_add_i32 s33, s32, 0xfc0
219+
; GCN-NEXT: s_mov_b32 s8, s34
219220
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
220221
; GCN-NEXT: s_and_b32 s33, s33, 0xfffff000
222+
; GCN-NEXT: s_mov_b32 s34, s32
221223
; GCN-NEXT: s_addk_i32 s32, 0x2000
222224
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
223225
; GCN-NEXT: s_cbranch_execz .LBB3_2
@@ -242,7 +244,8 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
242244
; GCN-NEXT: v_mov_b32_e32 v0, 0
243245
; GCN-NEXT: global_store_dword v[0:1], v0, off
244246
; GCN-NEXT: s_waitcnt vmcnt(0)
245-
; GCN-NEXT: s_addk_i32 s32, 0xe000
247+
; GCN-NEXT: s_mov_b32 s32, s34
248+
; GCN-NEXT: s_mov_b32 s34, s8
246249
; GCN-NEXT: s_mov_b32 s33, s7
247250
; GCN-NEXT: s_setpc_b64 s[30:31]
248251
entry:

llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,11 @@ define void @parent_func_missing_inputs() #0 {
3232
; FIXEDABI-NEXT: s_swappc_b64 s[30:31], s[16:17]
3333
; FIXEDABI-NEXT: v_readlane_b32 s31, v40, 1
3434
; FIXEDABI-NEXT: v_readlane_b32 s30, v40, 0
35+
; FIXEDABI-NEXT: s_mov_b32 s32, s33
3536
; FIXEDABI-NEXT: v_readlane_b32 s4, v40, 2
3637
; FIXEDABI-NEXT: s_or_saveexec_b64 s[6:7], -1
3738
; FIXEDABI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
3839
; FIXEDABI-NEXT: s_mov_b64 exec, s[6:7]
39-
; FIXEDABI-NEXT: s_addk_i32 s32, 0xfc00
4040
; FIXEDABI-NEXT: s_mov_b32 s33, s4
4141
; FIXEDABI-NEXT: s_waitcnt vmcnt(0)
4242
; FIXEDABI-NEXT: s_setpc_b64 s[30:31]

0 commit comments

Comments
 (0)