Skip to content

AMDGPU: Handle folding frame indexes into s_add_i32 #101694

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Aug 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 87 additions & 0 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2432,7 +2432,94 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
MI->eraseFromParent();
return true;
}
case AMDGPU::S_ADD_I32: {
// Fold a frame index used as a source of a scalar add directly into the
// instruction, rewriting its operands in place instead of materializing
// the frame address in a separate register first.
// TODO: Handle s_or_b32, s_and_b32.
// The add reads operands 1 and 2; one of them is the frame index
// (FIOperandNum), the other is whatever must be combined with it.
unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1;
MachineOperand &OtherOp = MI->getOperand(OtherOpIdx);

// If there is no frame register, this must be a bottom-of-stack frame,
// where the frame index reduces to a plain constant offset.
assert(FrameReg || MFI->isBottomOfStack());

MachineOperand &DstOp = MI->getOperand(0);
const DebugLoc &DL = MI->getDebugLoc();
// Register currently holding the (partially) computed frame address;
// starts as the raw frame register and is refined below.
Register MaterializedReg = FrameReg;

// Defend against live scc, which should never happen in practice.
bool DeadSCC = MI->getOperand(3).isDead();

Register TmpReg;

// Without flat scratch, the SGPR frame register is not directly usable as
// a per-lane byte offset, so shift it right by log2(wavefront size) into
// a scavenged temporary first. NOTE(review): assumes the non-flat-scratch
// SGPR stack value is wave-scaled — confirm against the ABI docs.
if (FrameReg && !ST.enableFlatScratch()) {
// FIXME: In the common case where the add does not also read its result
// (i.e. this isn't a reg += fi), it's not finding the dest reg as
// available.
TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass, MI,
false, 0);
BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_LSHR_B32))
.addDef(TmpReg, RegState::Renamable)
.addReg(FrameReg)
.addImm(ST.getWavefrontSizeLog2())
.setOperandDead(3); // Set SCC dead
MaterializedReg = TmpReg;
}

int64_t Offset = FrameInfo.getObjectOffset(Index);

// For the non-immediate case, we could fall through to the default
// handling, but we do an in-place update of the result register here to
// avoid scavenging another register.
if (OtherOp.isImm()) {
// Fold the object offset into the existing immediate operand, and turn
// the frame-index operand into the base register (or 0 if there isn't
// one).
OtherOp.setImm(OtherOp.getImm() + Offset);
Offset = 0;

if (MaterializedReg)
FIOp.ChangeToRegister(MaterializedReg, false);
else
FIOp.ChangeToImmediate(0);
} else if (MaterializedReg) {
// If we can't fold the other operand, do another increment.
Register DstReg = DstOp.getReg();

// If the base is still the raw frame register, don't clobber it:
// scavenge a temporary to hold the intermediate sum instead of writing
// the destination early.
if (!TmpReg && MaterializedReg == FrameReg) {
TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
MI, false, 0);
DstReg = TmpReg;
}

// Emit DstReg = MaterializedReg + OtherOp ahead of MI.
auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_ADD_I32))
.addDef(DstReg, RegState::Renamable)
.addReg(MaterializedReg, RegState::Kill)
.add(OtherOp);
if (DeadSCC)
AddI32.setOperandDead(3);

MaterializedReg = DstReg;

// Rewrite the original add as: Dst = MaterializedReg + Offset.
OtherOp.ChangeToRegister(MaterializedReg, false);
OtherOp.setIsKill(true);
OtherOp.setIsRenamable(true);
FIOp.ChangeToImmediate(Offset);
} else {
// If we don't have any other offset to apply, we can just directly
// interpret the frame index as the offset.
FIOp.ChangeToImmediate(Offset);
}

// Shrink degenerate adds of zero (only valid when SCC is dead, since a
// copy/move does not define SCC): s_add_i32 x, 0 -> COPY/S_MOV_B32 x.
if (DeadSCC && OtherOp.isImm() && OtherOp.getImm() == 0) {
assert(Offset == 0);
MI->removeOperand(3);
MI->removeOperand(OtherOpIdx);
MI->setDesc(TII->get(FIOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
} else if (DeadSCC && FIOp.isImm() && FIOp.getImm() == 0) {
assert(Offset == 0);
MI->removeOperand(3);
MI->removeOperand(FIOperandNum);
MI->setDesc(
TII->get(OtherOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
}

// Every path above must have replaced the frame-index operand.
assert(!FIOp.isFI());
return true;
}
default: {
// Other access to frame index
const DebugLoc &DL = MI->getDebugLoc();
Expand Down
34 changes: 12 additions & 22 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,9 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s1, s0, 2
; GFX9-NEXT: s_and_b32 s0, s0, 15
; GFX9-NEXT: s_add_i32 s1, s1, 0
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
; GFX9-NEXT: scratch_store_dword off, v0, s1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_add_i32 s0, s0, 0
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
Expand All @@ -36,8 +34,6 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX10-NEXT: s_and_b32 s1, s0, 15
; GFX10-NEXT: s_lshl_b32 s0, s0, 2
; GFX10-NEXT: s_lshl_b32 s1, s1, 2
; GFX10-NEXT: s_add_i32 s0, s0, 0
; GFX10-NEXT: s_add_i32 s1, s1, 0
; GFX10-NEXT: scratch_store_dword off, v0, s0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc
Expand All @@ -51,11 +47,9 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: s_lshl_b32 s1, s0, 2
; GFX940-NEXT: s_and_b32 s0, s0, 15
; GFX940-NEXT: s_add_i32 s1, s1, 0
; GFX940-NEXT: s_lshl_b32 s0, s0, 2
; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_add_i32 s0, s0, 0
; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_endpgm
Expand All @@ -68,8 +62,6 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX11-NEXT: s_and_b32 s1, s0, 15
; GFX11-NEXT: s_lshl_b32 s0, s0, 2
; GFX11-NEXT: s_lshl_b32 s1, s1, 2
; GFX11-NEXT: s_add_i32 s0, s0, 0
; GFX11-NEXT: s_add_i32 s1, s1, 0
; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc
Expand All @@ -84,8 +76,6 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX12-NEXT: s_and_b32 s1, s0, 15
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
; GFX12-NEXT: s_lshl_b32 s1, s1, 2
; GFX12-NEXT: s_add_co_i32 s0, s0, 0
; GFX12-NEXT: s_add_co_i32 s1, s1, 0
; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
Expand Down Expand Up @@ -1042,13 +1032,13 @@ define void @store_load_large_imm_offset_foo() {
; GFX9-LABEL: store_load_large_imm_offset_foo:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 13
; GFX9-NEXT: s_movk_i32 s0, 0x3e80
; GFX9-NEXT: s_add_i32 s1, s32, 4
; GFX9-NEXT: v_mov_b32_e32 v0, 13
; GFX9-NEXT: s_add_i32 s1, s32, s0
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: s_add_i32 s0, s0, s1
; GFX9-NEXT: s_add_i32 s0, s1, 4
; GFX9-NEXT: scratch_store_dword off, v0, s0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
Expand All @@ -1059,10 +1049,10 @@ define void @store_load_large_imm_offset_foo() {
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 13
; GFX10-NEXT: v_mov_b32_e32 v1, 15
; GFX10-NEXT: s_movk_i32 s0, 0x3e80
; GFX10-NEXT: s_add_i32 s1, s32, 4
; GFX10-NEXT: s_add_i32 s0, s0, s1
; GFX10-NEXT: v_mov_b32_e32 v1, 15
; GFX10-NEXT: s_add_i32 s1, s32, s0
; GFX10-NEXT: s_add_i32 s0, s1, 4
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_store_dword off, v1, s0
Expand All @@ -1074,13 +1064,13 @@ define void @store_load_large_imm_offset_foo() {
; GFX940-LABEL: store_load_large_imm_offset_foo:
; GFX940: ; %bb.0: ; %bb
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v0, 13
; GFX940-NEXT: s_movk_i32 s0, 0x3e80
; GFX940-NEXT: s_add_i32 s1, s32, 4
; GFX940-NEXT: v_mov_b32_e32 v0, 13
; GFX940-NEXT: s_add_i32 s1, s32, s0
; GFX940-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v0, 15
; GFX940-NEXT: s_add_i32 s0, s0, s1
; GFX940-NEXT: s_add_i32 s0, s1, 4
; GFX940-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
Expand All @@ -1092,9 +1082,9 @@ define void @store_load_large_imm_offset_foo() {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
; GFX11-NEXT: s_movk_i32 s0, 0x3e80
; GFX11-NEXT: s_add_i32 s1, s32, 4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_i32 s0, s0, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_add_i32 s1, s32, s0
; GFX11-NEXT: s_add_i32 s0, s1, 4
; GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc
Expand Down
Loading
Loading