Skip to content

Commit 897632d

Browse files
committed
AMDGPU: Handle folding frame indexes into s_add_i32
1 parent 7022498 commit 897632d

File tree

7 files changed

+293
-257
lines changed

7 files changed

+293
-257
lines changed

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2432,7 +2432,70 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
24322432
MI->eraseFromParent();
24332433
return true;
24342434
}
2435+
case AMDGPU::S_ADD_I32: {
2436+
// TODO: Handle s_or_b32, s_and_b32.
2437+
MachineOperand &OtherOp = MI->getOperand(FIOperandNum == 1 ? 2 : 1);
24352438

2439+
assert(FrameReg || MFI->isBottomOfStack());
2440+
2441+
MachineOperand &DstOp = MI->getOperand(0);
2442+
const DebugLoc &DL = MI->getDebugLoc();
2443+
Register MaterializedReg = FrameReg;
2444+
2445+
// Defend against live scc, which should never happen in practice.
2446+
bool DeadSCC = MI->getOperand(3).isDead();
2447+
2448+
// Do an in-place scale of the wave offset to the lane offset.
2449+
if (FrameReg && !ST.enableFlatScratch()) {
2450+
BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_LSHR_B32))
2451+
.addDef(DstOp.getReg(), RegState::Renamable)
2452+
.addReg(FrameReg)
2453+
.addImm(ST.getWavefrontSizeLog2())
2454+
.setOperandDead(3); // Set SCC dead
2455+
MaterializedReg = DstOp.getReg();
2456+
}
2457+
2458+
// If we can't fold the other operand, do another increment.
2459+
if (!OtherOp.isImm() && MaterializedReg) {
2460+
auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_ADD_I32))
2461+
.addDef(DstOp.getReg(), RegState::Renamable)
2462+
.addReg(MaterializedReg)
2463+
.add(OtherOp);
2464+
if (DeadSCC)
2465+
AddI32.setOperandDead(3);
2466+
MaterializedReg = DstOp.getReg();
2467+
}
2468+
2469+
int64_t NewOffset = FrameInfo.getObjectOffset(Index);
2470+
2471+
// For the non-immediate case, we could fall through to the default
2472+
// handling, but we do an in-place update of the result register here to
2473+
// avoid scavenging another register.
2474+
if (OtherOp.isImm())
2475+
NewOffset += OtherOp.getImm();
2476+
2477+
if (NewOffset == 0 && DeadSCC) {
2478+
MI->eraseFromParent();
2479+
} else if (!MaterializedReg && OtherOp.isImm()) {
2480+
// In a kernel, the address should just be an immediate.
2481+
// SCC should really be dead, but preserve the def just in case it
2482+
// isn't.
2483+
if (DeadSCC)
2484+
MI->removeOperand(3);
2485+
else
2486+
MI->getOperand(3).setIsDef(true);
2487+
2488+
MI->removeOperand(2);
2489+
MI->getOperand(1).ChangeToImmediate(NewOffset);
2490+
MI->setDesc(TII->get(AMDGPU::S_MOV_B32));
2491+
} else {
2492+
if (MaterializedReg)
2493+
OtherOp.ChangeToRegister(MaterializedReg, false);
2494+
FIOp.ChangeToImmediate(NewOffset);
2495+
}
2496+
2497+
return true;
2498+
}
24362499
default: {
24372500
// Other access to frame index
24382501
const DebugLoc &DL = MI->getDebugLoc();

llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll

Lines changed: 12 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,9 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
1515
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1616
; GFX9-NEXT: s_lshl_b32 s1, s0, 2
1717
; GFX9-NEXT: s_and_b32 s0, s0, 15
18-
; GFX9-NEXT: s_add_i32 s1, s1, 0
1918
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
2019
; GFX9-NEXT: scratch_store_dword off, v0, s1
2120
; GFX9-NEXT: s_waitcnt vmcnt(0)
22-
; GFX9-NEXT: s_add_i32 s0, s0, 0
2321
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
2422
; GFX9-NEXT: s_waitcnt vmcnt(0)
2523
; GFX9-NEXT: s_endpgm
@@ -36,8 +34,6 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
3634
; GFX10-NEXT: s_and_b32 s1, s0, 15
3735
; GFX10-NEXT: s_lshl_b32 s0, s0, 2
3836
; GFX10-NEXT: s_lshl_b32 s1, s1, 2
39-
; GFX10-NEXT: s_add_i32 s0, s0, 0
40-
; GFX10-NEXT: s_add_i32 s1, s1, 0
4137
; GFX10-NEXT: scratch_store_dword off, v0, s0
4238
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
4339
; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc
@@ -51,11 +47,9 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
5147
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
5248
; GFX940-NEXT: s_lshl_b32 s1, s0, 2
5349
; GFX940-NEXT: s_and_b32 s0, s0, 15
54-
; GFX940-NEXT: s_add_i32 s1, s1, 0
5550
; GFX940-NEXT: s_lshl_b32 s0, s0, 2
5651
; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
5752
; GFX940-NEXT: s_waitcnt vmcnt(0)
58-
; GFX940-NEXT: s_add_i32 s0, s0, 0
5953
; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
6054
; GFX940-NEXT: s_waitcnt vmcnt(0)
6155
; GFX940-NEXT: s_endpgm
@@ -68,8 +62,6 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
6862
; GFX11-NEXT: s_and_b32 s1, s0, 15
6963
; GFX11-NEXT: s_lshl_b32 s0, s0, 2
7064
; GFX11-NEXT: s_lshl_b32 s1, s1, 2
71-
; GFX11-NEXT: s_add_i32 s0, s0, 0
72-
; GFX11-NEXT: s_add_i32 s1, s1, 0
7365
; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc
7466
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
7567
; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc
@@ -84,8 +76,6 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
8476
; GFX12-NEXT: s_and_b32 s1, s0, 15
8577
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
8678
; GFX12-NEXT: s_lshl_b32 s1, s1, 2
87-
; GFX12-NEXT: s_add_co_i32 s0, s0, 0
88-
; GFX12-NEXT: s_add_co_i32 s1, s1, 0
8979
; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
9080
; GFX12-NEXT: s_wait_storecnt 0x0
9181
; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
@@ -1042,13 +1032,13 @@ define void @store_load_large_imm_offset_foo() {
10421032
; GFX9-LABEL: store_load_large_imm_offset_foo:
10431033
; GFX9: ; %bb.0: ; %bb
10441034
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1045-
; GFX9-NEXT: v_mov_b32_e32 v0, 13
10461035
; GFX9-NEXT: s_movk_i32 s0, 0x3e80
1047-
; GFX9-NEXT: s_add_i32 s1, s32, 4
1036+
; GFX9-NEXT: v_mov_b32_e32 v0, 13
1037+
; GFX9-NEXT: s_add_i32 s0, s32, s0
10481038
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4
10491039
; GFX9-NEXT: s_waitcnt vmcnt(0)
10501040
; GFX9-NEXT: v_mov_b32_e32 v0, 15
1051-
; GFX9-NEXT: s_add_i32 s0, s0, s1
1041+
; GFX9-NEXT: s_add_i32 s0, s0, 4
10521042
; GFX9-NEXT: scratch_store_dword off, v0, s0
10531043
; GFX9-NEXT: s_waitcnt vmcnt(0)
10541044
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
@@ -1059,10 +1049,10 @@ define void @store_load_large_imm_offset_foo() {
10591049
; GFX10: ; %bb.0: ; %bb
10601050
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10611051
; GFX10-NEXT: v_mov_b32_e32 v0, 13
1062-
; GFX10-NEXT: v_mov_b32_e32 v1, 15
10631052
; GFX10-NEXT: s_movk_i32 s0, 0x3e80
1064-
; GFX10-NEXT: s_add_i32 s1, s32, 4
1065-
; GFX10-NEXT: s_add_i32 s0, s0, s1
1053+
; GFX10-NEXT: v_mov_b32_e32 v1, 15
1054+
; GFX10-NEXT: s_add_i32 s0, s32, s0
1055+
; GFX10-NEXT: s_add_i32 s0, s0, 4
10661056
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4
10671057
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
10681058
; GFX10-NEXT: scratch_store_dword off, v1, s0
@@ -1074,13 +1064,13 @@ define void @store_load_large_imm_offset_foo() {
10741064
; GFX940-LABEL: store_load_large_imm_offset_foo:
10751065
; GFX940: ; %bb.0: ; %bb
10761066
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1077-
; GFX940-NEXT: v_mov_b32_e32 v0, 13
10781067
; GFX940-NEXT: s_movk_i32 s0, 0x3e80
1079-
; GFX940-NEXT: s_add_i32 s1, s32, 4
1068+
; GFX940-NEXT: v_mov_b32_e32 v0, 13
1069+
; GFX940-NEXT: s_add_i32 s0, s32, s0
10801070
; GFX940-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1
10811071
; GFX940-NEXT: s_waitcnt vmcnt(0)
10821072
; GFX940-NEXT: v_mov_b32_e32 v0, 15
1083-
; GFX940-NEXT: s_add_i32 s0, s0, s1
1073+
; GFX940-NEXT: s_add_i32 s0, s0, 4
10841074
; GFX940-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
10851075
; GFX940-NEXT: s_waitcnt vmcnt(0)
10861076
; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
@@ -1092,9 +1082,9 @@ define void @store_load_large_imm_offset_foo() {
10921082
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10931083
; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
10941084
; GFX11-NEXT: s_movk_i32 s0, 0x3e80
1095-
; GFX11-NEXT: s_add_i32 s1, s32, 4
1096-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1097-
; GFX11-NEXT: s_add_i32 s0, s0, s1
1085+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1086+
; GFX11-NEXT: s_add_i32 s0, s32, s0
1087+
; GFX11-NEXT: s_add_i32 s0, s0, 4
10981088
; GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc
10991089
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
11001090
; GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc

0 commit comments

Comments
 (0)