Skip to content

[AMDGPU] Restore SP from saved-FP or saved-BP #124007

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jan 24, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 12 additions & 8 deletions llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1256,6 +1256,18 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
Register FramePtrReg = FuncInfo->getFrameOffsetReg();
bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);

if (RoundedSize != 0) {
if (TRI.hasBasePointer(MF)) {
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), StackPtrReg)
.addReg(TRI.getBaseRegister())
.setMIFlag(MachineInstr::FrameDestroy);
} else if (hasFP(MF)) {
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), StackPtrReg)
.addReg(FramePtrReg)
.setMIFlag(MachineInstr::FrameDestroy);
}
}

Register FramePtrRegScratchCopy;
Register SGPRForFPSaveRestoreCopy =
FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
Expand All @@ -1280,14 +1292,6 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
FramePtrRegScratchCopy);
}

if (RoundedSize != 0 && hasFP(MF)) {
auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
.addReg(StackPtrReg)
.addImm(-static_cast<int64_t>(RoundedSize * getScratchScaleFactor(ST)))
.setMIFlag(MachineInstr::FrameDestroy);
Add->getOperand(3).setIsDead(); // Mark SCC as dead.
}

if (FPSaved) {
// Insert the copy to restore FP.
Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy
Expand Down
3 changes: 1 addition & 2 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -525,8 +525,7 @@ Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
// When we need stack realignment, we can't reference off of the
// stack pointer, so we reserve a base pointer.
const MachineFrameInfo &MFI = MF.getFrameInfo();
return MFI.getNumFixedObjects() && shouldRealignStack(MF);
return shouldRealignStack(MF);
Copy link
Contributor

@arsenm arsenm Jan 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a FIXME to undo this; we use too many base pointers.

}

Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,11 @@ define ptr addrspace(1) @call_assert_align() {
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s4, v40, 2
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
; CHECK-NEXT: s_addk_i32 s32, 0xfc00
; CHECK-NEXT: s_mov_b32 s33, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -247,11 +247,11 @@ define void @func_caller_stack() {
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
; MUBUF-NEXT: s_mov_b32 s32, s33
; MUBUF-NEXT: v_readlane_b32 s4, v40, 2
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
; MUBUF-NEXT: s_addk_i32 s32, 0xfc00
; MUBUF-NEXT: s_mov_b32 s33, s4
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_setpc_b64 s[30:31]
Expand Down Expand Up @@ -286,11 +286,11 @@ define void @func_caller_stack() {
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
; FLATSCR-NEXT: s_mov_b32 s32, s33
; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
; FLATSCR-NEXT: s_add_i32 s32, s32, -16
; FLATSCR-NEXT: s_mov_b32 s33, s0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
Expand Down Expand Up @@ -372,11 +372,11 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) {
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
; MUBUF-NEXT: s_mov_b32 s32, s33
; MUBUF-NEXT: v_readlane_b32 s4, v40, 2
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
; MUBUF-NEXT: s_addk_i32 s32, 0xfc00
; MUBUF-NEXT: s_mov_b32 s33, s4
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_setpc_b64 s[30:31]
Expand Down Expand Up @@ -437,11 +437,11 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) {
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
; FLATSCR-NEXT: s_mov_b32 s32, s33
; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
; FLATSCR-NEXT: s_add_i32 s32, s32, -16
; FLATSCR-NEXT: s_mov_b32 s33, s0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
Expand Down
48 changes: 28 additions & 20 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
Original file line number Diff line number Diff line change
Expand Up @@ -80,13 +80,13 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX9-NEXT: s_mov_b32 s33, s7
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15
; GFX9-NEXT: s_and_b32 s4, s4, -16
; GFX9-NEXT: s_lshl_b32 s4, s4, 6
; GFX9-NEXT: s_add_u32 s32, s6, s4
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: s_mov_b32 s33, s7
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
Expand All @@ -103,7 +103,6 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, s6
; GFX10-NEXT: s_mov_b32 s33, s7
; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0
Expand All @@ -112,7 +111,8 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
; GFX10-NEXT: s_and_b32 s4, s4, -16
; GFX10-NEXT: s_lshl_b32 s4, s4, 5
; GFX10-NEXT: s_add_u32 s32, s6, s4
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_mov_b32 s33, s7
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: func_dynamic_stackalloc_sgpr_align4:
Expand All @@ -127,7 +127,6 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_mov_b32 s2, s32
; GFX11-NEXT: s_mov_b32 s33, s3
; GFX11-NEXT: scratch_store_b32 off, v0, s2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
Expand All @@ -136,9 +135,10 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, s0, -16
; GFX11-NEXT: s_lshl_b32 s0, s0, 5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_u32 s32, s2, s0
; GFX11-NEXT: s_add_i32 s32, s32, -16
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_mov_b32 s33, s3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%n = load i32, ptr addrspace(4) @gv, align 4
%alloca = alloca i32, i32 %n, addrspace(5)
Expand Down Expand Up @@ -221,13 +221,13 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX9-NEXT: s_mov_b32 s33, s7
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15
; GFX9-NEXT: s_and_b32 s4, s4, -16
; GFX9-NEXT: s_lshl_b32 s4, s4, 6
; GFX9-NEXT: s_add_u32 s32, s6, s4
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: s_mov_b32 s33, s7
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
Expand All @@ -244,7 +244,6 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, s6
; GFX10-NEXT: s_mov_b32 s33, s7
; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0
Expand All @@ -253,7 +252,8 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
; GFX10-NEXT: s_and_b32 s4, s4, -16
; GFX10-NEXT: s_lshl_b32 s4, s4, 5
; GFX10-NEXT: s_add_u32 s32, s6, s4
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_mov_b32 s33, s7
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: func_dynamic_stackalloc_sgpr_align16:
Expand All @@ -268,7 +268,6 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_mov_b32 s2, s32
; GFX11-NEXT: s_mov_b32 s33, s3
; GFX11-NEXT: scratch_store_b32 off, v0, s2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
Expand All @@ -277,9 +276,10 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, s0, -16
; GFX11-NEXT: s_lshl_b32 s0, s0, 5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_u32 s32, s2, s0
; GFX11-NEXT: s_add_i32 s32, s32, -16
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_mov_b32 s33, s3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%n = load i32, ptr addrspace(4) @gv, align 16
%alloca = alloca i32, i32 %n, addrspace(5)
Expand Down Expand Up @@ -355,6 +355,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
; GFX9-NEXT: s_mov_b32 s6, s33
; GFX9-NEXT: s_add_i32 s33, s32, 0x7c0
; GFX9-NEXT: s_and_b32 s33, s33, 0xfffff800
; GFX9-NEXT: s_mov_b32 s7, s34
; GFX9-NEXT: s_mov_b32 s34, s32
; GFX9-NEXT: s_addk_i32 s32, 0x1000
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4
Expand All @@ -373,7 +375,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
; GFX9-NEXT: s_and_b32 s4, s4, -16
; GFX9-NEXT: s_lshl_b32 s4, s4, 6
; GFX9-NEXT: s_add_u32 s32, s5, s4
; GFX9-NEXT: s_addk_i32 s32, 0xf000
; GFX9-NEXT: s_mov_b32 s32, s34
; GFX9-NEXT: s_mov_b32 s34, s7
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
Expand All @@ -382,8 +385,10 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_mov_b32 s6, s33
; GFX10-NEXT: s_add_i32 s33, s32, 0x3e0
; GFX10-NEXT: s_addk_i32 s32, 0x800
; GFX10-NEXT: s_mov_b32 s7, s34
; GFX10-NEXT: s_and_b32 s33, s33, 0xfffffc00
; GFX10-NEXT: s_mov_b32 s34, s32
; GFX10-NEXT: s_addk_i32 s32, 0x800
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
Expand All @@ -401,16 +406,19 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
; GFX10-NEXT: s_and_b32 s4, s4, -16
; GFX10-NEXT: s_lshl_b32 s4, s4, 5
; GFX10-NEXT: s_add_u32 s32, s5, s4
; GFX10-NEXT: s_addk_i32 s32, 0xf800
; GFX10-NEXT: s_mov_b32 s32, s34
; GFX10-NEXT: s_mov_b32 s34, s7
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: func_dynamic_stackalloc_sgpr_align32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s2, s33
; GFX11-NEXT: s_add_i32 s33, s32, 31
; GFX11-NEXT: s_add_i32 s32, s32, 64
; GFX11-NEXT: s_mov_b32 s3, s34
; GFX11-NEXT: s_and_not1_b32 s33, s33, 31
; GFX11-NEXT: s_mov_b32 s34, s32
; GFX11-NEXT: s_add_i32 s32, s32, 64
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, gv@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, gv@gotpcrel32@hi+12
Expand All @@ -429,8 +437,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_lshl_b32 s0, s0, 5
; GFX11-NEXT: s_add_u32 s32, s1, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_addk_i32 s32, 0xffc0
; GFX11-NEXT: s_mov_b32 s32, s34
; GFX11-NEXT: s_mov_b32 s34, s3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%n = load i32, ptr addrspace(4) @gv
%alloca = alloca i32, i32 %n, align 32, addrspace(5)
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
Original file line number Diff line number Diff line change
Expand Up @@ -248,11 +248,11 @@ define void @sink_null_insert_pt(ptr addrspace(4) %arg0) {
; GFX9-NEXT: s_swappc_b64 s[30:31], 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
; GFX9-NEXT: s_mov_b32 s33, s4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
Expand Down
7 changes: 5 additions & 2 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_mov_b32 s33, s7
; GCN-NEXT: s_setpc_b64 s[30:31]

Expand Down Expand Up @@ -216,8 +216,10 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s7, s33
; GCN-NEXT: s_add_i32 s33, s32, 0xfc0
; GCN-NEXT: s_mov_b32 s8, s34
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GCN-NEXT: s_and_b32 s33, s33, 0xfffff000
; GCN-NEXT: s_mov_b32 s34, s32
; GCN-NEXT: s_addk_i32 s32, 0x2000
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB3_2
Expand All @@ -242,7 +244,8 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_addk_i32 s32, 0xe000
; GCN-NEXT: s_mov_b32 s32, s34
; GCN-NEXT: s_mov_b32 s34, s8
; GCN-NEXT: s_mov_b32 s33, s7
; GCN-NEXT: s_setpc_b64 s[30:31]
entry:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,11 @@ define void @parent_func_missing_inputs() #0 {
; FIXEDABI-NEXT: s_swappc_b64 s[30:31], s[16:17]
; FIXEDABI-NEXT: v_readlane_b32 s31, v40, 1
; FIXEDABI-NEXT: v_readlane_b32 s30, v40, 0
; FIXEDABI-NEXT: s_mov_b32 s32, s33
; FIXEDABI-NEXT: v_readlane_b32 s4, v40, 2
; FIXEDABI-NEXT: s_or_saveexec_b64 s[6:7], -1
; FIXEDABI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; FIXEDABI-NEXT: s_mov_b64 exec, s[6:7]
; FIXEDABI-NEXT: s_addk_i32 s32, 0xfc00
; FIXEDABI-NEXT: s_mov_b32 s33, s4
; FIXEDABI-NEXT: s_waitcnt vmcnt(0)
; FIXEDABI-NEXT: s_setpc_b64 s[30:31]
Expand Down
Loading
Loading