
Commit 7e25825

arsenm authored and easyonaadit committed
AMDGPU: Handle folding frame indexes into s_add_i32 (llvm#101694)
This does not yet enable producing direct frame index references in s_add_i32, only the lowering.
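
For illustration, here is a rough sketch of the lowering this enables (the MIR below is hypothetical and not taken from the patch; the register names, frame object, and offsets are made up). When eliminateFrameIndex sees an s_add_i32 whose source operand is a frame index and flat scratch is enabled, it replaces the frame index with the SGPR frame register and folds the frame object's offset into the other operand:

    ; Hypothetical input MIR, stack object %stack.0 at frame offset 16:
    %sum:sreg_32 = S_ADD_I32 %stack.0, 64, implicit-def dead $scc
    ; After frame index elimination: the frame register stands in for the
    ; frame index and the object offset is absorbed into the immediate:
    %sum:sreg_32 = S_ADD_I32 $sgpr32, 80, implicit-def dead $scc

Without flat scratch, the frame register holds a wave-scaled address, so the lowering first emits an s_lshr_b32 of the frame register by the wavefront-size log2 into a scavenged SGPR and adds that instead.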
1 parent 4995fb6 commit 7e25825

12 files changed: +1659 −369 lines

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 87 additions & 0 deletions
@@ -2445,7 +2445,94 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     MI->eraseFromParent();
     return true;
   }
+  case AMDGPU::S_ADD_I32: {
+    // TODO: Handle s_or_b32, s_and_b32.
+    unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1;
+    MachineOperand &OtherOp = MI->getOperand(OtherOpIdx);

+    assert(FrameReg || MFI->isBottomOfStack());
+
+    MachineOperand &DstOp = MI->getOperand(0);
+    const DebugLoc &DL = MI->getDebugLoc();
+    Register MaterializedReg = FrameReg;
+
+    // Defend against live scc, which should never happen in practice.
+    bool DeadSCC = MI->getOperand(3).isDead();
+
+    Register TmpReg;
+
+    if (FrameReg && !ST.enableFlatScratch()) {
+      // FIXME: In the common case where the add does not also read its result
+      // (i.e. this isn't a reg += fi), it's not finding the dest reg as
+      // available.
+      TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass, MI,
+                                             false, 0);
+      BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_LSHR_B32))
+          .addDef(TmpReg, RegState::Renamable)
+          .addReg(FrameReg)
+          .addImm(ST.getWavefrontSizeLog2())
+          .setOperandDead(3); // Set SCC dead
+      MaterializedReg = TmpReg;
+    }
+
+    int64_t Offset = FrameInfo.getObjectOffset(Index);
+
+    // For the non-immediate case, we could fall through to the default
+    // handling, but we do an in-place update of the result register here to
+    // avoid scavenging another register.
+    if (OtherOp.isImm()) {
+      OtherOp.setImm(OtherOp.getImm() + Offset);
+      Offset = 0;
+
+      if (MaterializedReg)
+        FIOp.ChangeToRegister(MaterializedReg, false);
+      else
+        FIOp.ChangeToImmediate(0);
+    } else if (MaterializedReg) {
+      // If we can't fold the other operand, do another increment.
+      Register DstReg = DstOp.getReg();
+
+      if (!TmpReg && MaterializedReg == FrameReg) {
+        TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
+                                               MI, false, 0);
+        DstReg = TmpReg;
+      }
+
+      auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_ADD_I32))
+                        .addDef(DstReg, RegState::Renamable)
+                        .addReg(MaterializedReg, RegState::Kill)
+                        .add(OtherOp);
+      if (DeadSCC)
+        AddI32.setOperandDead(3);
+
+      MaterializedReg = DstReg;
+
+      OtherOp.ChangeToRegister(MaterializedReg, false);
+      OtherOp.setIsKill(true);
+      OtherOp.setIsRenamable(true);
+      FIOp.ChangeToImmediate(Offset);
+    } else {
+      // If we don't have any other offset to apply, we can just directly
+      // interpret the frame index as the offset.
+      FIOp.ChangeToImmediate(Offset);
+    }
+
+    if (DeadSCC && OtherOp.isImm() && OtherOp.getImm() == 0) {
+      assert(Offset == 0);
+      MI->removeOperand(3);
+      MI->removeOperand(OtherOpIdx);
+      MI->setDesc(TII->get(FIOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
+    } else if (DeadSCC && FIOp.isImm() && FIOp.getImm() == 0) {
+      assert(Offset == 0);
+      MI->removeOperand(3);
+      MI->removeOperand(FIOperandNum);
+      MI->setDesc(
+          TII->get(OtherOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
+    }
+
+    assert(!FIOp.isFI());
+    return true;
+  }
   default: {
     // Other access to frame index
     const DebugLoc &DL = MI->getDebugLoc();
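
A second path worth noting, visible in the test updates below: when SCC is known dead and the folded operand degenerates to adding zero, the s_add_i32 is rewritten into a plain COPY (or S_MOV_B32 for an immediate), which later passes can fold away entirely. A minimal hedged sketch with hypothetical MIR (an entry-function case where the frame object sits at offset 0 and no frame register is needed):

    ; Hypothetical input MIR, %stack.0 at frame offset 0, SCC dead:
    %addr:sreg_32 = S_ADD_I32 %off, %stack.0, implicit-def dead $scc
    ; The frame index lowers to the immediate 0, so the add collapses:
    %addr:sreg_32 = COPY %off

This is why the redundant s_add_i32 sN, sN, 0 instructions disappear from the scratch tests that follow.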

llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll

Lines changed: 12 additions & 24 deletions
@@ -21,11 +21,9 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_lshl_b32 s1, s0, 2
 ; GFX9-NEXT: s_and_b32 s0, s0, 15
-; GFX9-NEXT: s_add_i32 s1, s1, 0
 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2
 ; GFX9-NEXT: scratch_store_dword off, v0, s1
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_add_i32 s0, s0, 0
 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_endpgm
@@ -42,8 +40,6 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
 ; GFX10-NEXT: s_and_b32 s1, s0, 15
 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2
 ; GFX10-NEXT: s_lshl_b32 s1, s1, 2
-; GFX10-NEXT: s_add_i32 s0, s0, 0
-; GFX10-NEXT: s_add_i32 s1, s1, 0
 ; GFX10-NEXT: scratch_store_dword off, v0, s0
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc
@@ -57,7 +53,6 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX940-NEXT: s_lshl_b32 s1, s0, 2
 ; GFX940-NEXT: s_and_b32 s0, s0, 15
-; GFX940-NEXT: s_add_i32 s1, s1, 0
 ; GFX940-NEXT: s_lshl_b32 s0, s0, 2
 ; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
 ; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -75,7 +70,6 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
 ; GFX11-NEXT: s_lshl_b32 s1, s1, 2
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: s_add_i32 s0, s0, 0
 ; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT: scratch_load_b32 v0, v1, off glc dlc
@@ -108,11 +102,9 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
 ; UNALIGNED_GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; UNALIGNED_GFX9-NEXT: s_lshl_b32 s1, s0, 2
 ; UNALIGNED_GFX9-NEXT: s_and_b32 s0, s0, 15
-; UNALIGNED_GFX9-NEXT: s_add_i32 s1, s1, 0
 ; UNALIGNED_GFX9-NEXT: s_lshl_b32 s0, s0, 2
 ; UNALIGNED_GFX9-NEXT: scratch_store_dword off, v0, s1
 ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0)
-; UNALIGNED_GFX9-NEXT: s_add_i32 s0, s0, 0
 ; UNALIGNED_GFX9-NEXT: scratch_load_dword v0, off, s0 glc
 ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0)
 ; UNALIGNED_GFX9-NEXT: s_endpgm
@@ -129,8 +121,6 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
 ; UNALIGNED_GFX10-NEXT: s_and_b32 s1, s0, 15
 ; UNALIGNED_GFX10-NEXT: s_lshl_b32 s0, s0, 2
 ; UNALIGNED_GFX10-NEXT: s_lshl_b32 s1, s1, 2
-; UNALIGNED_GFX10-NEXT: s_add_i32 s0, s0, 0
-; UNALIGNED_GFX10-NEXT: s_add_i32 s1, s1, 0
 ; UNALIGNED_GFX10-NEXT: scratch_store_dword off, v0, s0
 ; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; UNALIGNED_GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc
@@ -144,7 +134,6 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
 ; UNALIGNED_GFX940-NEXT: s_waitcnt lgkmcnt(0)
 ; UNALIGNED_GFX940-NEXT: s_lshl_b32 s1, s0, 2
 ; UNALIGNED_GFX940-NEXT: s_and_b32 s0, s0, 15
-; UNALIGNED_GFX940-NEXT: s_add_i32 s1, s1, 0
 ; UNALIGNED_GFX940-NEXT: s_lshl_b32 s0, s0, 2
 ; UNALIGNED_GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -162,7 +151,6 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
 ; UNALIGNED_GFX11-NEXT: s_lshl_b32 s1, s1, 2
 ; UNALIGNED_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
-; UNALIGNED_GFX11-NEXT: s_add_i32 s0, s0, 0
 ; UNALIGNED_GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc
 ; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0
 ; UNALIGNED_GFX11-NEXT: scratch_load_b32 v0, v1, off glc dlc
@@ -1923,13 +1911,13 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX9-LABEL: store_load_large_imm_offset_foo:
 ; GFX9: ; %bb.0: ; %bb
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 13
 ; GFX9-NEXT: s_movk_i32 s0, 0x3e80
-; GFX9-NEXT: s_add_i32 s1, s32, 4
+; GFX9-NEXT: v_mov_b32_e32 v0, 13
+; GFX9-NEXT: s_add_i32 s1, s32, s0
 ; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v0, 15
-; GFX9-NEXT: s_add_i32 s0, s0, s1
+; GFX9-NEXT: s_add_i32 s0, s1, 4
 ; GFX9-NEXT: scratch_store_dword off, v0, s0
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
@@ -1940,10 +1928,10 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX10: ; %bb.0: ; %bb
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: v_mov_b32_e32 v0, 13
-; GFX10-NEXT: v_mov_b32_e32 v1, 15
 ; GFX10-NEXT: s_movk_i32 s0, 0x3e80
-; GFX10-NEXT: s_add_i32 s1, s32, 4
-; GFX10-NEXT: s_add_i32 s0, s0, s1
+; GFX10-NEXT: v_mov_b32_e32 v1, 15
+; GFX10-NEXT: s_add_i32 s1, s32, s0
+; GFX10-NEXT: s_add_i32 s0, s1, 4
 ; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: scratch_store_dword off, v1, s0
@@ -1999,13 +1987,13 @@ define void @store_load_large_imm_offset_foo() {
 ; UNALIGNED_GFX9-LABEL: store_load_large_imm_offset_foo:
 ; UNALIGNED_GFX9: ; %bb.0: ; %bb
 ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v0, 13
 ; UNALIGNED_GFX9-NEXT: s_movk_i32 s0, 0x3e80
-; UNALIGNED_GFX9-NEXT: s_add_i32 s1, s32, 4
+; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v0, 13
+; UNALIGNED_GFX9-NEXT: s_add_i32 s1, s32, s0
 ; UNALIGNED_GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4
 ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0)
 ; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v0, 15
-; UNALIGNED_GFX9-NEXT: s_add_i32 s0, s0, s1
+; UNALIGNED_GFX9-NEXT: s_add_i32 s0, s1, 4
 ; UNALIGNED_GFX9-NEXT: scratch_store_dword off, v0, s0
 ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0)
 ; UNALIGNED_GFX9-NEXT: scratch_load_dword v0, off, s0 glc
@@ -2016,10 +2004,10 @@ define void @store_load_large_imm_offset_foo() {
 ; UNALIGNED_GFX10: ; %bb.0: ; %bb
 ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v0, 13
-; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, 15
 ; UNALIGNED_GFX10-NEXT: s_movk_i32 s0, 0x3e80
-; UNALIGNED_GFX10-NEXT: s_add_i32 s1, s32, 4
-; UNALIGNED_GFX10-NEXT: s_add_i32 s0, s0, s1
+; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, 15
+; UNALIGNED_GFX10-NEXT: s_add_i32 s1, s32, s0
+; UNALIGNED_GFX10-NEXT: s_add_i32 s0, s1, 4
 ; UNALIGNED_GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4
 ; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; UNALIGNED_GFX10-NEXT: scratch_store_dword off, v1, s0
