Skip to content

Commit 39830fe

Browse files
committed
[AMDGPU][PEI] Set up SP for chain functions
Initialize the SP to 0 in the prologue of functions with the `amdgpu_cs_chain` or `amdgpu_cs_chain_preserve` calling conventions, but only if they need one (i.e. if they contain calls to `amdgpu_gfx` functions or if they have stack objects).

Also make sure we don't try to realign the stack (since 0 is aligned enough).

Differential Revision: https://reviews.llvm.org/D156413
1 parent 24b11ba commit 39830fe

File tree

6 files changed

+239
-45
lines changed

6 files changed

+239
-45
lines changed

llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1090,6 +1090,20 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
10901090
// to determine the end of the prologue.
10911091
DebugLoc DL;
10921092

1093+
if (FuncInfo->isChainFunction()) {
1094+
// Functions with the amdgpu_cs_chain[_preserve] CC don't receive a SP, but
1095+
// are free to set one up if they need it.
1096+
// FIXME: We shouldn't need to set SP just for the stack objects (we should
1097+
// use 0 as an immediate offset instead).
1098+
bool UseSP = requiresStackPointerReference(MF) || MFI.hasStackObjects();
1099+
if (UseSP) {
1100+
assert(StackPtrReg != AMDGPU::SP_REG);
1101+
1102+
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B32), StackPtrReg)
1103+
.addImm(0);
1104+
}
1105+
}
1106+
10931107
bool HasFP = false;
10941108
bool HasBP = false;
10951109
uint32_t NumBytes = MFI.getStackSize();
@@ -1808,11 +1822,16 @@ bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
18081822
// register. We may need to initialize the stack pointer depending on the frame
18091823
// properties, which logically overlaps many of the cases where an ordinary
18101824
// function would require an FP.
1825+
// Also used for chain functions. While not technically entry functions, chain
1826+
// functions may need to set up a stack pointer in some situations.
18111827
bool SIFrameLowering::requiresStackPointerReference(
18121828
const MachineFunction &MF) const {
1829+
bool IsChainFunction = MF.getInfo<SIMachineFunctionInfo>()->isChainFunction();
1830+
18131831
// Callable functions always require a stack pointer reference.
1814-
assert(MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
1815-
"only expected to call this for entry points");
1832+
assert((MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() ||
1833+
IsChainFunction) &&
1834+
"only expected to call this for entry points and chain functions");
18161835

18171836
const MachineFrameInfo &MFI = MF.getFrameInfo();
18181837

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -733,12 +733,12 @@ bool SIRegisterInfo::isAsmClobberable(const MachineFunction &MF,
733733

734734
bool SIRegisterInfo::shouldRealignStack(const MachineFunction &MF) const {
735735
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
736-
// On entry, the base address is 0, so it can't possibly need any more
737-
// alignment.
736+
// On entry or in chain functions, the base address is 0, so it can't possibly
737+
// need any more alignment.
738738

739739
// FIXME: Should be able to specify the entry frame alignment per calling
740740
// convention instead.
741-
if (Info->isEntryFunction())
741+
if (Info->isEntryFunction() || Info->isChainFunction())
742742
return false;
743743

744744
return TargetRegisterInfo::shouldRealignStack(MF);

llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll

Lines changed: 85 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_no_stack({ptr, i32, <4 x i32>} inre
2929
ret void
3030
}
3131

32-
; FIXME: Setup s32.
33-
3432
define amdgpu_cs_chain void @amdgpu_cs_chain_simple_call(<4 x i32> inreg %sgpr, <4 x i32> %vgpr) {
3533
; GISEL-GFX11-LABEL: amdgpu_cs_chain_simple_call:
3634
; GISEL-GFX11: ; %bb.0:
@@ -41,7 +39,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_simple_call(<4 x i32> inreg %sgpr,
4139
; GISEL-GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
4240
; GISEL-GFX11-NEXT: s_mov_b32 s4, use@abs32@lo
4341
; GISEL-GFX11-NEXT: s_mov_b32 s5, use@abs32@hi
44-
; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
42+
; GISEL-GFX11-NEXT: s_mov_b32 s32, 0
4543
; GISEL-GFX11-NEXT: s_swappc_b64 s[30:31], s[4:5]
4644
; GISEL-GFX11-NEXT: s_endpgm
4745
;
@@ -60,6 +58,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_simple_call(<4 x i32> inreg %sgpr,
6058
; GISEL-GFX10-NEXT: s_mov_b32 s4, use@abs32@lo
6159
; GISEL-GFX10-NEXT: s_mov_b32 s5, use@abs32@hi
6260
; GISEL-GFX10-NEXT: s_mov_b64 s[2:3], s[50:51]
61+
; GISEL-GFX10-NEXT: s_mov_b32 s32, 0
6362
; GISEL-GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
6463
; GISEL-GFX10-NEXT: s_endpgm
6564
;
@@ -72,7 +71,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_simple_call(<4 x i32> inreg %sgpr,
7271
; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
7372
; DAGISEL-GFX11-NEXT: s_mov_b32 s5, use@abs32@hi
7473
; DAGISEL-GFX11-NEXT: s_mov_b32 s4, use@abs32@lo
75-
; DAGISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
74+
; DAGISEL-GFX11-NEXT: s_mov_b32 s32, 0
7675
; DAGISEL-GFX11-NEXT: s_swappc_b64 s[30:31], s[4:5]
7776
; DAGISEL-GFX11-NEXT: s_endpgm
7877
;
@@ -91,18 +90,19 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_simple_call(<4 x i32> inreg %sgpr,
9190
; DAGISEL-GFX10-NEXT: s_mov_b32 s5, use@abs32@hi
9291
; DAGISEL-GFX10-NEXT: s_mov_b32 s4, use@abs32@lo
9392
; DAGISEL-GFX10-NEXT: s_mov_b64 s[2:3], s[50:51]
93+
; DAGISEL-GFX10-NEXT: s_mov_b32 s32, 0
9494
; DAGISEL-GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
9595
; DAGISEL-GFX10-NEXT: s_endpgm
9696
call amdgpu_gfx void @use(<4 x i32> %sgpr, <4 x i32> %vgpr)
9797
ret void
9898
}
9999

100-
; FIXME: Setup s32.
101-
102100
define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24 x i32> %vgprs) {
103101
; GISEL-GFX11-LABEL: amdgpu_cs_chain_spill:
104102
; GISEL-GFX11: ; %bb.0:
105103
; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
104+
; GISEL-GFX11-NEXT: s_mov_b32 s32, 0
105+
; GISEL-GFX11-NEXT: v_dual_mov_b32 v32, v8 :: v_dual_mov_b32 v33, v9
106106
; GISEL-GFX11-NEXT: s_add_u32 s24, s32, 4
107107
; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32
108108
; GISEL-GFX11-NEXT: scratch_store_b32 off, v17, s24
@@ -123,7 +123,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
123123
; GISEL-GFX11-NEXT: scratch_store_b32 off, v24, s24
124124
; GISEL-GFX11-NEXT: scratch_store_b32 off, v25, s25
125125
; GISEL-GFX11-NEXT: s_add_u32 s24, s32, 40
126-
; GISEL-GFX11-NEXT: v_dual_mov_b32 v32, v8 :: v_dual_mov_b32 v33, v9
127126
; GISEL-GFX11-NEXT: v_dual_mov_b32 v34, v10 :: v_dual_mov_b32 v35, v11
128127
; GISEL-GFX11-NEXT: v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v37, v13
129128
; GISEL-GFX11-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v39, v15
@@ -171,6 +170,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
171170
; GISEL-GFX10-NEXT: v_mov_b32_e32 v37, v13
172171
; GISEL-GFX10-NEXT: v_mov_b32_e32 v38, v14
173172
; GISEL-GFX10-NEXT: v_mov_b32_e32 v39, v15
173+
; GISEL-GFX10-NEXT: s_mov_b32 s32, 0
174174
; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32
175175
; GISEL-GFX10-NEXT: buffer_store_dword v17, off, s[48:51], s32 offset:4
176176
; GISEL-GFX10-NEXT: buffer_store_dword v18, off, s[48:51], s32 offset:8
@@ -229,6 +229,8 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
229229
; DAGISEL-GFX11-LABEL: amdgpu_cs_chain_spill:
230230
; DAGISEL-GFX11: ; %bb.0:
231231
; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
232+
; DAGISEL-GFX11-NEXT: s_mov_b32 s32, 0
233+
; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v32, v15 :: v_dual_mov_b32 v33, v14
232234
; DAGISEL-GFX11-NEXT: s_add_i32 s24, s32, 60
233235
; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32
234236
; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v31, s24
@@ -249,7 +251,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
249251
; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v24, s24
250252
; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v23, s25
251253
; DAGISEL-GFX11-NEXT: s_add_i32 s24, s32, 24
252-
; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v32, v15 :: v_dual_mov_b32 v33, v14
253254
; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v34, v13 :: v_dual_mov_b32 v35, v12
254255
; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v36, v11 :: v_dual_mov_b32 v37, v10
255256
; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v38, v9 :: v_dual_mov_b32 v39, v8
@@ -297,6 +298,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
297298
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v37, v10
298299
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v38, v9
299300
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v39, v8
301+
; DAGISEL-GFX10-NEXT: s_mov_b32 s32, 0
300302
; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32
301303
; DAGISEL-GFX10-NEXT: buffer_store_dword v17, off, s[48:51], s32 offset:4
302304
; DAGISEL-GFX10-NEXT: buffer_store_dword v18, off, s[48:51], s32 offset:8
@@ -796,6 +798,81 @@ define amdgpu_cs_chain void @chain_to_chain_more_args(<3 x i32> inreg %a, <3 x i
796798
unreachable
797799
}
798800

801+
define amdgpu_cs_chain void @amdgpu_cs_chain_dont_realign_stack(i32 %idx) {
802+
; GISEL-GFX11-LABEL: amdgpu_cs_chain_dont_realign_stack:
803+
; GISEL-GFX11: ; %bb.0:
804+
; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
805+
; GISEL-GFX11-NEXT: s_mov_b32 s3, 4
806+
; GISEL-GFX11-NEXT: s_mov_b32 s2, 3
807+
; GISEL-GFX11-NEXT: s_mov_b32 s1, 2
808+
; GISEL-GFX11-NEXT: s_mov_b32 s0, 1
809+
; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v8
810+
; GISEL-GFX11-NEXT: s_mov_b32 s32, 0
811+
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
812+
; GISEL-GFX11-NEXT: v_add_nc_u32_e32 v4, s32, v0
813+
; GISEL-GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
814+
; GISEL-GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
815+
; GISEL-GFX11-NEXT: scratch_store_b128 v4, v[0:3], off dlc
816+
; GISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
817+
; GISEL-GFX11-NEXT: s_endpgm
818+
;
819+
; GISEL-GFX10-LABEL: amdgpu_cs_chain_dont_realign_stack:
820+
; GISEL-GFX10: ; %bb.0:
821+
; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
822+
; GISEL-GFX10-NEXT: s_mov_b32 s32, 0
823+
; GISEL-GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v8
824+
; GISEL-GFX10-NEXT: v_lshrrev_b32_e64 v2, 5, s32
825+
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 1
826+
; GISEL-GFX10-NEXT: v_mov_b32_e32 v3, 3
827+
; GISEL-GFX10-NEXT: v_mov_b32_e32 v4, 4
828+
; GISEL-GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0
829+
; GISEL-GFX10-NEXT: v_mov_b32_e32 v2, 2
830+
; GISEL-GFX10-NEXT: buffer_store_dword v1, v0, s[48:51], 0 offen
831+
; GISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0
832+
; GISEL-GFX10-NEXT: buffer_store_dword v2, v0, s[48:51], 0 offen offset:4
833+
; GISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0
834+
; GISEL-GFX10-NEXT: buffer_store_dword v3, v0, s[48:51], 0 offen offset:8
835+
; GISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0
836+
; GISEL-GFX10-NEXT: buffer_store_dword v4, v0, s[48:51], 0 offen offset:12
837+
; GISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0
838+
; GISEL-GFX10-NEXT: s_endpgm
839+
;
840+
; DAGISEL-GFX11-LABEL: amdgpu_cs_chain_dont_realign_stack:
841+
; DAGISEL-GFX11: ; %bb.0:
842+
; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
843+
; DAGISEL-GFX11-NEXT: s_mov_b32 s32, 0
844+
; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
845+
; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
846+
; DAGISEL-GFX11-NEXT: v_lshl_add_u32 v4, v8, 4, s32
847+
; DAGISEL-GFX11-NEXT: scratch_store_b128 v4, v[0:3], off dlc
848+
; DAGISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
849+
; DAGISEL-GFX11-NEXT: s_endpgm
850+
;
851+
; DAGISEL-GFX10-LABEL: amdgpu_cs_chain_dont_realign_stack:
852+
; DAGISEL-GFX10: ; %bb.0:
853+
; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
854+
; DAGISEL-GFX10-NEXT: s_mov_b32 s32, 0
855+
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v0, 4
856+
; DAGISEL-GFX10-NEXT: v_lshrrev_b32_e64 v2, 5, s32
857+
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v3, 2
858+
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v4, 1
859+
; DAGISEL-GFX10-NEXT: v_lshl_add_u32 v1, v8, 4, v2
860+
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v2, 3
861+
; DAGISEL-GFX10-NEXT: buffer_store_dword v0, v1, s[48:51], 0 offen offset:12
862+
; DAGISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0
863+
; DAGISEL-GFX10-NEXT: buffer_store_dword v2, v1, s[48:51], 0 offen offset:8
864+
; DAGISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0
865+
; DAGISEL-GFX10-NEXT: buffer_store_dword v3, v1, s[48:51], 0 offen offset:4
866+
; DAGISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0
867+
; DAGISEL-GFX10-NEXT: buffer_store_dword v4, v1, s[48:51], 0 offen
868+
; DAGISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0
869+
; DAGISEL-GFX10-NEXT: s_endpgm
870+
%alloca.align32 = alloca [8 x <4 x i32>], align 32, addrspace(5)
871+
%gep0 = getelementptr inbounds [8 x <4 x i32>], ptr addrspace(5) %alloca.align32, i32 0, i32 %idx
872+
store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %gep0, align 32
873+
ret void
874+
}
875+
799876
declare void @llvm.amdgcn.cs.chain.v2i32(ptr, i32, <2 x i32>, <2 x i32>, i32, ...)
800877
declare void @llvm.amdgcn.cs.chain.v3i32(ptr, i32, <3 x i32>, <3 x i32>, i32, ...)
801878
declare void @llvm.amdgcn.cs.chain.v4i32(ptr, i32, <4 x i32>, <4 x i32>, i32, ...)

0 commit comments

Comments
 (0)