Skip to content

Commit 18de2ef

Browse files
committed
AMDGPU: Handle folding frame indexes into s_add_i32
1 parent 36f0d64 commit 18de2ef

File tree

7 files changed

+293
-257
lines changed

7 files changed

+293
-257
lines changed

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 63 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2268,7 +2268,70 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
22682268
MI->eraseFromParent();
22692269
return true;
22702270
}
2271+
case AMDGPU::S_ADD_I32: {
2272+
// TODO: Handle s_or_b32, s_and_b32.
2273+
MachineOperand &OtherOp = MI->getOperand(FIOperandNum == 1 ? 2 : 1);
22712274

2275+
assert(FrameReg || MFI->isBottomOfStack());
2276+
2277+
MachineOperand &DstOp = MI->getOperand(0);
2278+
const DebugLoc &DL = MI->getDebugLoc();
2279+
Register MaterializedReg = FrameReg;
2280+
2281+
// Defend against live scc, which should never happen in practice.
2282+
bool DeadSCC = MI->getOperand(3).isDead();
2283+
2284+
// Do an in-place scale of the wave offset to the lane offset.
2285+
if (FrameReg && !ST.enableFlatScratch()) {
2286+
BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_LSHR_B32))
2287+
.addDef(DstOp.getReg(), RegState::Renamable)
2288+
.addReg(FrameReg)
2289+
.addImm(ST.getWavefrontSizeLog2())
2290+
.setOperandDead(3); // Set SCC dead
2291+
MaterializedReg = DstOp.getReg();
2292+
}
2293+
2294+
// If we can't fold the other operand, do another increment.
2295+
if (!OtherOp.isImm() && MaterializedReg) {
2296+
auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_ADD_I32))
2297+
.addDef(DstOp.getReg(), RegState::Renamable)
2298+
.addReg(MaterializedReg)
2299+
.add(OtherOp);
2300+
if (DeadSCC)
2301+
AddI32.setOperandDead(3);
2302+
MaterializedReg = DstOp.getReg();
2303+
}
2304+
2305+
int64_t NewOffset = FrameInfo.getObjectOffset(Index);
2306+
2307+
// For the non-immediate case, we could fall through to the default
2308+
// handling, but we do an in-place update of the result register here to
2309+
// avoid scavenging another register.
2310+
if (OtherOp.isImm())
2311+
NewOffset += OtherOp.getImm();
2312+
2313+
if (NewOffset == 0 && DeadSCC) {
2314+
MI->eraseFromParent();
2315+
} else if (!MaterializedReg && OtherOp.isImm()) {
2316+
// In a kernel, the address should just be an immediate.
2317+
// SCC should really be dead, but preserve the def just in case it
2318+
// isn't.
2319+
if (DeadSCC)
2320+
MI->removeOperand(3);
2321+
else
2322+
MI->getOperand(3).setIsDef(true);
2323+
2324+
MI->removeOperand(2);
2325+
MI->getOperand(1).ChangeToImmediate(NewOffset);
2326+
MI->setDesc(TII->get(AMDGPU::S_MOV_B32));
2327+
} else {
2328+
if (MaterializedReg)
2329+
OtherOp.ChangeToRegister(MaterializedReg, false);
2330+
FIOp.ChangeToImmediate(NewOffset);
2331+
}
2332+
2333+
return true;
2334+
}
22722335
default: {
22732336
// Other access to frame index
22742337
const DebugLoc &DL = MI->getDebugLoc();

llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll

Lines changed: 12 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -15,11 +15,9 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
1515
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1616
; GFX9-NEXT: s_lshl_b32 s1, s0, 2
1717
; GFX9-NEXT: s_and_b32 s0, s0, 15
18-
; GFX9-NEXT: s_add_i32 s1, s1, 0
1918
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
2019
; GFX9-NEXT: scratch_store_dword off, v0, s1
2120
; GFX9-NEXT: s_waitcnt vmcnt(0)
22-
; GFX9-NEXT: s_add_i32 s0, s0, 0
2321
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
2422
; GFX9-NEXT: s_waitcnt vmcnt(0)
2523
; GFX9-NEXT: s_endpgm
@@ -36,8 +34,6 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
3634
; GFX10-NEXT: s_and_b32 s1, s0, 15
3735
; GFX10-NEXT: s_lshl_b32 s0, s0, 2
3836
; GFX10-NEXT: s_lshl_b32 s1, s1, 2
39-
; GFX10-NEXT: s_add_i32 s0, s0, 0
40-
; GFX10-NEXT: s_add_i32 s1, s1, 0
4137
; GFX10-NEXT: scratch_store_dword off, v0, s0
4238
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
4339
; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc
@@ -51,11 +47,9 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
5147
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
5248
; GFX940-NEXT: s_lshl_b32 s1, s0, 2
5349
; GFX940-NEXT: s_and_b32 s0, s0, 15
54-
; GFX940-NEXT: s_add_i32 s1, s1, 0
5550
; GFX940-NEXT: s_lshl_b32 s0, s0, 2
5651
; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
5752
; GFX940-NEXT: s_waitcnt vmcnt(0)
58-
; GFX940-NEXT: s_add_i32 s0, s0, 0
5953
; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
6054
; GFX940-NEXT: s_waitcnt vmcnt(0)
6155
; GFX940-NEXT: s_endpgm
@@ -68,8 +62,6 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
6862
; GFX11-NEXT: s_and_b32 s1, s0, 15
6963
; GFX11-NEXT: s_lshl_b32 s0, s0, 2
7064
; GFX11-NEXT: s_lshl_b32 s1, s1, 2
71-
; GFX11-NEXT: s_add_i32 s0, s0, 0
72-
; GFX11-NEXT: s_add_i32 s1, s1, 0
7365
; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc
7466
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
7567
; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc
@@ -84,8 +76,6 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
8476
; GFX12-NEXT: s_and_b32 s1, s0, 15
8577
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
8678
; GFX12-NEXT: s_lshl_b32 s1, s1, 2
87-
; GFX12-NEXT: s_add_co_i32 s0, s0, 0
88-
; GFX12-NEXT: s_add_co_i32 s1, s1, 0
8979
; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
9080
; GFX12-NEXT: s_wait_storecnt 0x0
9181
; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
@@ -1042,13 +1032,13 @@ define void @store_load_large_imm_offset_foo() {
10421032
; GFX9-LABEL: store_load_large_imm_offset_foo:
10431033
; GFX9: ; %bb.0: ; %bb
10441034
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1045-
; GFX9-NEXT: v_mov_b32_e32 v0, 13
10461035
; GFX9-NEXT: s_movk_i32 s0, 0x3e80
1047-
; GFX9-NEXT: s_add_i32 s1, s32, 4
1036+
; GFX9-NEXT: v_mov_b32_e32 v0, 13
1037+
; GFX9-NEXT: s_add_i32 s0, s32, s0
10481038
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4
10491039
; GFX9-NEXT: s_waitcnt vmcnt(0)
10501040
; GFX9-NEXT: v_mov_b32_e32 v0, 15
1051-
; GFX9-NEXT: s_add_i32 s0, s0, s1
1041+
; GFX9-NEXT: s_add_i32 s0, s0, 4
10521042
; GFX9-NEXT: scratch_store_dword off, v0, s0
10531043
; GFX9-NEXT: s_waitcnt vmcnt(0)
10541044
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
@@ -1059,10 +1049,10 @@ define void @store_load_large_imm_offset_foo() {
10591049
; GFX10: ; %bb.0: ; %bb
10601050
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10611051
; GFX10-NEXT: v_mov_b32_e32 v0, 13
1062-
; GFX10-NEXT: v_mov_b32_e32 v1, 15
10631052
; GFX10-NEXT: s_movk_i32 s0, 0x3e80
1064-
; GFX10-NEXT: s_add_i32 s1, s32, 4
1065-
; GFX10-NEXT: s_add_i32 s0, s0, s1
1053+
; GFX10-NEXT: v_mov_b32_e32 v1, 15
1054+
; GFX10-NEXT: s_add_i32 s0, s32, s0
1055+
; GFX10-NEXT: s_add_i32 s0, s0, 4
10661056
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4
10671057
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
10681058
; GFX10-NEXT: scratch_store_dword off, v1, s0
@@ -1074,13 +1064,13 @@ define void @store_load_large_imm_offset_foo() {
10741064
; GFX940-LABEL: store_load_large_imm_offset_foo:
10751065
; GFX940: ; %bb.0: ; %bb
10761066
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1077-
; GFX940-NEXT: v_mov_b32_e32 v0, 13
10781067
; GFX940-NEXT: s_movk_i32 s0, 0x3e80
1079-
; GFX940-NEXT: s_add_i32 s1, s32, 4
1068+
; GFX940-NEXT: v_mov_b32_e32 v0, 13
1069+
; GFX940-NEXT: s_add_i32 s0, s32, s0
10801070
; GFX940-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1
10811071
; GFX940-NEXT: s_waitcnt vmcnt(0)
10821072
; GFX940-NEXT: v_mov_b32_e32 v0, 15
1083-
; GFX940-NEXT: s_add_i32 s0, s0, s1
1073+
; GFX940-NEXT: s_add_i32 s0, s0, 4
10841074
; GFX940-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
10851075
; GFX940-NEXT: s_waitcnt vmcnt(0)
10861076
; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
@@ -1092,9 +1082,9 @@ define void @store_load_large_imm_offset_foo() {
10921082
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10931083
; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
10941084
; GFX11-NEXT: s_movk_i32 s0, 0x3e80
1095-
; GFX11-NEXT: s_add_i32 s1, s32, 4
1096-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1097-
; GFX11-NEXT: s_add_i32 s0, s0, s1
1085+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1086+
; GFX11-NEXT: s_add_i32 s0, s32, s0
1087+
; GFX11-NEXT: s_add_i32 s0, s0, 4
10981088
; GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc
10991089
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
11001090
; GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc

0 commit comments

Comments
 (0)