Skip to content

Commit eb3c02f

Browse files
authored
[AMDGPU] Use immediates for stack accesses in chain funcs (#71913)
Switch to using immediate offsets instead of the SP register to access objects on the current stack frame in chain functions. This means we no longer need to reserve an SP register just for accessing stack objects, and it also allows us to set the SP (when one is actually needed) to the stack size from the very beginning. This only works if we use a FixedObject for the ScavengeFI, which is what we do for entry functions anyway (and we generally want to keep chain functions close to amdgpu_cs behaviour where we don't have a good reason to diverge).
1 parent 9bc142a commit eb3c02f

File tree

7 files changed

+166
-144
lines changed

7 files changed

+166
-144
lines changed

llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1093,14 +1093,12 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
10931093
if (FuncInfo->isChainFunction()) {
10941094
// Functions with the amdgpu_cs_chain[_preserve] CC don't receive a SP, but
10951095
// are free to set one up if they need it.
1096-
// FIXME: We shouldn't need to set SP just for the stack objects (we should
1097-
// use 0 as an immediate offset instead).
1098-
bool UseSP = requiresStackPointerReference(MF) || MFI.hasStackObjects();
1096+
bool UseSP = requiresStackPointerReference(MF);
10991097
if (UseSP) {
11001098
assert(StackPtrReg != AMDGPU::SP_REG);
11011099

11021100
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B32), StackPtrReg)
1103-
.addImm(0);
1101+
.addImm(MFI.getStackSize() * getScratchScaleFactor(ST));
11041102
}
11051103
}
11061104

@@ -1115,7 +1113,8 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
11151113
Register FramePtrRegScratchCopy;
11161114
if (!HasFP && !hasFP(MF)) {
11171115
// Emit the CSR spill stores with SP base register.
1118-
emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, StackPtrReg,
1116+
emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits,
1117+
FuncInfo->isChainFunction() ? Register() : StackPtrReg,
11191118
FramePtrRegScratchCopy);
11201119
} else {
11211120
// CSR spill stores will use FP as base register.
@@ -1799,10 +1798,11 @@ static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
17991798
bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
18001799
const MachineFrameInfo &MFI = MF.getFrameInfo();
18011800

1802-
// For entry functions we can use an immediate offset in most cases, so the
1803-
// presence of calls doesn't imply we need a distinct frame pointer.
1801+
// For entry & chain functions we can use an immediate offset in most cases,
1802+
// so the presence of calls doesn't imply we need a distinct frame pointer.
18041803
if (MFI.hasCalls() &&
1805-
!MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1804+
!MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
1805+
!MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) {
18061806
// All offsets are unsigned, so need to be addressed in the same direction
18071807
// as stack growth.
18081808

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -519,7 +519,7 @@ int SIMachineFunctionInfo::getScavengeFI(MachineFrameInfo &MFI,
519519
const SIRegisterInfo &TRI) {
520520
if (ScavengeFI)
521521
return *ScavengeFI;
522-
if (isEntryFunction()) {
522+
if (isEntryFunction() || isChainFunction()) {
523523
ScavengeFI = MFI.CreateFixedObject(
524524
TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
525525
} else {

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -499,11 +499,11 @@ SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
499499
Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
500500
const SIFrameLowering *TFI = ST.getFrameLowering();
501501
const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
502-
// During ISel lowering we always reserve the stack pointer in entry
502+
// During ISel lowering we always reserve the stack pointer in entry and chain
503503
// functions, but never actually want to reference it when accessing our own
504504
// frame. If we need a frame pointer we use it, but otherwise we can just use
505505
// an immediate "0" which we represent by returning NoRegister.
506-
if (FuncInfo->isEntryFunction()) {
506+
if (FuncInfo->isEntryFunction() || FuncInfo->isChainFunction()) {
507507
return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register();
508508
}
509509
return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
@@ -1649,7 +1649,7 @@ void SIRegisterInfo::buildSpillLoadStore(
16491649
if (UseVGPROffset && ScratchOffsetReg) {
16501650
MIB.addReg(ScratchOffsetReg);
16511651
} else {
1652-
assert(FuncInfo->isEntryFunction());
1652+
assert(FuncInfo->isEntryFunction() || FuncInfo->isChainFunction());
16531653
MIB.addImm(0);
16541654
}
16551655
}
@@ -2424,7 +2424,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
24242424

24252425
bool IsMUBUF = TII->isMUBUF(*MI);
24262426

2427-
if (!IsMUBUF && !MFI->isEntryFunction()) {
2427+
if (!IsMUBUF && !MFI->isEntryFunction() && !MFI->isChainFunction()) {
24282428
// Convert to a swizzled stack address by scaling by the wave size.
24292429
// In an entry function/kernel the offset is already swizzled.
24302430
bool IsSALU = isSGPRClass(TII->getOpRegClass(*MI, FIOperandNum));

llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll

Lines changed: 66 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -357,6 +357,65 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
357357
ret void
358358
}
359359

360+
define amdgpu_cs_chain void @alloca_and_call() {
361+
; GISEL-GFX11-LABEL: alloca_and_call:
362+
; GISEL-GFX11: ; %bb.0: ; %.entry
363+
; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
364+
; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 42
365+
; GISEL-GFX11-NEXT: s_mov_b32 s0, use@abs32@lo
366+
; GISEL-GFX11-NEXT: s_mov_b32 s1, use@abs32@hi
367+
; GISEL-GFX11-NEXT: s_mov_b32 s32, 16
368+
; GISEL-GFX11-NEXT: scratch_store_b32 off, v0, off offset:4
369+
; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 4
370+
; GISEL-GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
371+
; GISEL-GFX11-NEXT: s_endpgm
372+
;
373+
; GISEL-GFX10-LABEL: alloca_and_call:
374+
; GISEL-GFX10: ; %bb.0: ; %.entry
375+
; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
376+
; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 42
377+
; GISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49]
378+
; GISEL-GFX10-NEXT: s_mov_b32 s4, use@abs32@lo
379+
; GISEL-GFX10-NEXT: s_mov_b32 s5, use@abs32@hi
380+
; GISEL-GFX10-NEXT: s_mov_b64 s[2:3], s[50:51]
381+
; GISEL-GFX10-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:4
382+
; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 4
383+
; GISEL-GFX10-NEXT: s_movk_i32 s32, 0x200
384+
; GISEL-GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
385+
; GISEL-GFX10-NEXT: s_endpgm
386+
;
387+
; DAGISEL-GFX11-LABEL: alloca_and_call:
388+
; DAGISEL-GFX11: ; %bb.0: ; %.entry
389+
; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
390+
; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v0, 42
391+
; DAGISEL-GFX11-NEXT: s_mov_b32 s1, use@abs32@hi
392+
; DAGISEL-GFX11-NEXT: s_mov_b32 s0, use@abs32@lo
393+
; DAGISEL-GFX11-NEXT: s_mov_b32 s32, 16
394+
; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v0, off offset:4
395+
; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v0, 4
396+
; DAGISEL-GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
397+
; DAGISEL-GFX11-NEXT: s_endpgm
398+
;
399+
; DAGISEL-GFX10-LABEL: alloca_and_call:
400+
; DAGISEL-GFX10: ; %bb.0: ; %.entry
401+
; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
402+
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v0, 42
403+
; DAGISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49]
404+
; DAGISEL-GFX10-NEXT: s_mov_b32 s5, use@abs32@hi
405+
; DAGISEL-GFX10-NEXT: s_mov_b32 s4, use@abs32@lo
406+
; DAGISEL-GFX10-NEXT: s_mov_b64 s[2:3], s[50:51]
407+
; DAGISEL-GFX10-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:4
408+
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v0, 4
409+
; DAGISEL-GFX10-NEXT: s_movk_i32 s32, 0x200
410+
; DAGISEL-GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
411+
; DAGISEL-GFX10-NEXT: s_endpgm
412+
.entry:
413+
%v = alloca [3 x i32], addrspace(5)
414+
store i32 42, ptr addrspace(5) %v
415+
call amdgpu_gfx void @use(ptr addrspace(5) %v)
416+
ret void
417+
}
418+
360419
define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %a, <3 x i32> %b) {
361420
; GISEL-GFX11-LABEL: cs_to_chain:
362421
; GISEL-GFX11: ; %bb.0:
@@ -807,9 +866,8 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_dont_realign_stack(i32 %idx) {
807866
; GISEL-GFX11-NEXT: s_mov_b32 s1, 2
808867
; GISEL-GFX11-NEXT: s_mov_b32 s0, 1
809868
; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v8
810-
; GISEL-GFX11-NEXT: s_mov_b32 s32, 0
811-
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
812-
; GISEL-GFX11-NEXT: v_add_nc_u32_e32 v4, s32, v0
869+
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
870+
; GISEL-GFX11-NEXT: v_add_nc_u32_e32 v4, 32, v0
813871
; GISEL-GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
814872
; GISEL-GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
815873
; GISEL-GFX11-NEXT: scratch_store_b128 v4, v[0:3], off dlc
@@ -819,14 +877,12 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_dont_realign_stack(i32 %idx) {
819877
; GISEL-GFX10-LABEL: amdgpu_cs_chain_dont_realign_stack:
820878
; GISEL-GFX10: ; %bb.0:
821879
; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
822-
; GISEL-GFX10-NEXT: s_mov_b32 s32, 0
823880
; GISEL-GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v8
824-
; GISEL-GFX10-NEXT: v_lshrrev_b32_e64 v2, 5, s32
825881
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 1
882+
; GISEL-GFX10-NEXT: v_mov_b32_e32 v2, 2
826883
; GISEL-GFX10-NEXT: v_mov_b32_e32 v3, 3
827884
; GISEL-GFX10-NEXT: v_mov_b32_e32 v4, 4
828-
; GISEL-GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0
829-
; GISEL-GFX10-NEXT: v_mov_b32_e32 v2, 2
885+
; GISEL-GFX10-NEXT: v_add_nc_u32_e32 v0, 32, v0
830886
; GISEL-GFX10-NEXT: buffer_store_dword v1, v0, s[48:51], 0 offen
831887
; GISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0
832888
; GISEL-GFX10-NEXT: buffer_store_dword v2, v0, s[48:51], 0 offen offset:4
@@ -840,24 +896,21 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_dont_realign_stack(i32 %idx) {
840896
; DAGISEL-GFX11-LABEL: amdgpu_cs_chain_dont_realign_stack:
841897
; DAGISEL-GFX11: ; %bb.0:
842898
; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
843-
; DAGISEL-GFX11-NEXT: s_mov_b32 s32, 0
844899
; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
845900
; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
846-
; DAGISEL-GFX11-NEXT: v_lshl_add_u32 v4, v8, 4, s32
901+
; DAGISEL-GFX11-NEXT: v_lshl_add_u32 v4, v8, 4, 32
847902
; DAGISEL-GFX11-NEXT: scratch_store_b128 v4, v[0:3], off dlc
848903
; DAGISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
849904
; DAGISEL-GFX11-NEXT: s_endpgm
850905
;
851906
; DAGISEL-GFX10-LABEL: amdgpu_cs_chain_dont_realign_stack:
852907
; DAGISEL-GFX10: ; %bb.0:
853908
; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
854-
; DAGISEL-GFX10-NEXT: s_mov_b32 s32, 0
855909
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v0, 4
856-
; DAGISEL-GFX10-NEXT: v_lshrrev_b32_e64 v2, 5, s32
910+
; DAGISEL-GFX10-NEXT: v_lshl_add_u32 v1, v8, 4, 32
911+
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v2, 3
857912
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v3, 2
858913
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v4, 1
859-
; DAGISEL-GFX10-NEXT: v_lshl_add_u32 v1, v8, 4, v2
860-
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v2, 3
861914
; DAGISEL-GFX10-NEXT: buffer_store_dword v0, v1, s[48:51], 0 offen offset:12
862915
; DAGISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0
863916
; DAGISEL-GFX10-NEXT: buffer_store_dword v2, v1, s[48:51], 0 offen offset:8

0 commit comments

Comments
 (0)