Skip to content

Commit 8726c2b

Browse files
committed
Redo it
1 parent 377996e commit 8726c2b

File tree

7 files changed

+276
-273
lines changed

7 files changed

+276
-273
lines changed

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 50 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -2434,7 +2434,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
24342434
}
24352435
case AMDGPU::S_ADD_I32: {
24362436
// TODO: Handle s_or_b32, s_and_b32.
2437-
MachineOperand &OtherOp = MI->getOperand(FIOperandNum == 1 ? 2 : 1);
2437+
unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1;
2438+
MachineOperand &OtherOp = MI->getOperand(OtherOpIdx);
24382439

24392440
assert(FrameReg || MFI->isBottomOfStack());
24402441

@@ -2445,13 +2446,15 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
24452446
// Defend against live scc, which should never happen in practice.
24462447
bool DeadSCC = MI->getOperand(3).isDead();
24472448

2449+
Register TmpReg;
2450+
24482451
// Do an in-place scale of the wave offset to the lane offset.
24492452
if (FrameReg && !ST.enableFlatScratch()) {
24502453
// FIXME: In the common case where the add does not also read its result
24512454
// (i.e. this isn't a reg += fi), it's not finding the dest reg as
24522455
// available.
2453-
Register TmpReg = RS->scavengeRegisterBackwards(
2454-
AMDGPU::SReg_32_XM0RegClass, MI, false, 0);
2456+
TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass, MI,
2457+
false, 0);
24552458
BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_LSHR_B32))
24562459
.addDef(TmpReg, RegState::Renamable)
24572460
.addReg(FrameReg)
@@ -2460,48 +2463,65 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
24602463
MaterializedReg = TmpReg;
24612464
}
24622465

2466+
int64_t Offset = FrameInfo.getObjectOffset(Index);
2467+
2468+
// For the non-immediate case, we could fall through to the default
2469+
// handling, but we do an in-place update of the result register here to
2470+
// avoid scavenging another register.
2471+
if (OtherOp.isImm()) {
2472+
OtherOp.setImm(OtherOp.getImm() + Offset);
2473+
Offset = 0;
2474+
}
2475+
24632476
// If we can't fold the other operand, do another increment.
24642477
if (!OtherOp.isImm() && MaterializedReg) {
2478+
Register DstReg = DstOp.getReg();
2479+
2480+
if (!TmpReg && MaterializedReg == FrameReg) {
2481+
TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2482+
MI, false, 0);
2483+
DstReg = TmpReg;
2484+
}
2485+
24652486
auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_ADD_I32))
2466-
.addDef(DstOp.getReg(), RegState::Renamable)
2467-
.addReg(MaterializedReg)
2487+
.addDef(DstReg, RegState::Renamable)
2488+
.addReg(MaterializedReg, RegState::Kill)
24682489
.add(OtherOp);
24692490
if (DeadSCC)
24702491
AddI32.setOperandDead(3);
2471-
MaterializedReg = DstOp.getReg();
2472-
}
24732492

2474-
int64_t NewOffset = FrameInfo.getObjectOffset(Index);
2493+
MaterializedReg = DstReg;
24752494

2476-
// For the non-immediate case, we could fall through to the default
2477-
// handling, but we do an in-place update of the result register here to
2478-
// avoid scavenging another register.
2479-
if (OtherOp.isImm())
2480-
NewOffset += OtherOp.getImm();
2495+
OtherOp.ChangeToRegister(MaterializedReg, false);
2496+
OtherOp.setIsKill(true);
2497+
OtherOp.setIsRenamable(true);
2498+
FIOp.ChangeToImmediate(Offset);
2499+
} else if (!OtherOp.isImm() && !MaterializedReg) {
2500+
FIOp.ChangeToImmediate(Offset);
2501+
} else {
2502+
assert(Offset == 0);
24812503

2482-
if (NewOffset == 0 && DeadSCC && DstOp.getReg() == MaterializedReg) {
2504+
if (MaterializedReg)
2505+
FIOp.ChangeToRegister(MaterializedReg, false);
2506+
else
2507+
FIOp.ChangeToImmediate(0);
2508+
}
2509+
2510+
if (DeadSCC && OtherOp.isImm() && OtherOp.getImm() == 0) {
2511+
assert(Offset == 0);
2512+
MI->removeOperand(3);
2513+
MI->removeOperand(OtherOpIdx);
2514+
MI->setDesc(TII->get(FIOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
2515+
} else if (DeadSCC && FIOp.isImm() && FIOp.getImm() == 0) {
2516+
assert(Offset == 0);
24832517
MI->removeOperand(3);
24842518
MI->removeOperand(FIOperandNum);
24852519
MI->setDesc(
24862520
TII->get(OtherOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
2487-
} else if (!MaterializedReg && OtherOp.isImm()) {
2488-
// In a kernel, the address should just be an immediate.
2489-
// SCC should really be dead, but preserve the def just in case it
2490-
// isn't.
2491-
if (DeadSCC)
2492-
MI->removeOperand(3);
2493-
else
2494-
MI->getOperand(3).setIsDef(true);
2495-
2496-
MI->removeOperand(2);
2497-
MI->getOperand(1).ChangeToImmediate(NewOffset);
2498-
MI->setDesc(TII->get(AMDGPU::S_MOV_B32));
2499-
} else {
2500-
if (MaterializedReg)
2501-
OtherOp.ChangeToRegister(MaterializedReg, false);
2502-
FIOp.ChangeToImmediate(NewOffset);
25032521
}
25042522

2523+
assert(!FIOp.isFI());
2524+
25052525
return true;
25062526
}
25072527
default: {

llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll

Lines changed: 8 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -15,11 +15,9 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
1515
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1616
; GFX9-NEXT: s_lshl_b32 s1, s0, 2
1717
; GFX9-NEXT: s_and_b32 s0, s0, 15
18-
; GFX9-NEXT: s_add_i32 s1, s1, 0
1918
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
2019
; GFX9-NEXT: scratch_store_dword off, v0, s1
2120
; GFX9-NEXT: s_waitcnt vmcnt(0)
22-
; GFX9-NEXT: s_add_i32 s0, s0, 0
2321
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
2422
; GFX9-NEXT: s_waitcnt vmcnt(0)
2523
; GFX9-NEXT: s_endpgm
@@ -36,8 +34,6 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
3634
; GFX10-NEXT: s_and_b32 s1, s0, 15
3735
; GFX10-NEXT: s_lshl_b32 s0, s0, 2
3836
; GFX10-NEXT: s_lshl_b32 s1, s1, 2
39-
; GFX10-NEXT: s_add_i32 s0, s0, 0
40-
; GFX10-NEXT: s_add_i32 s1, s1, 0
4137
; GFX10-NEXT: scratch_store_dword off, v0, s0
4238
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
4339
; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc
@@ -51,11 +47,9 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
5147
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
5248
; GFX940-NEXT: s_lshl_b32 s1, s0, 2
5349
; GFX940-NEXT: s_and_b32 s0, s0, 15
54-
; GFX940-NEXT: s_add_i32 s1, s1, 0
5550
; GFX940-NEXT: s_lshl_b32 s0, s0, 2
5651
; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
5752
; GFX940-NEXT: s_waitcnt vmcnt(0)
58-
; GFX940-NEXT: s_add_i32 s0, s0, 0
5953
; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
6054
; GFX940-NEXT: s_waitcnt vmcnt(0)
6155
; GFX940-NEXT: s_endpgm
@@ -68,8 +62,6 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
6862
; GFX11-NEXT: s_and_b32 s1, s0, 15
6963
; GFX11-NEXT: s_lshl_b32 s0, s0, 2
7064
; GFX11-NEXT: s_lshl_b32 s1, s1, 2
71-
; GFX11-NEXT: s_add_i32 s0, s0, 0
72-
; GFX11-NEXT: s_add_i32 s1, s1, 0
7365
; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc
7466
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
7567
; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc
@@ -84,8 +76,6 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
8476
; GFX12-NEXT: s_and_b32 s1, s0, 15
8577
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
8678
; GFX12-NEXT: s_lshl_b32 s1, s1, 2
87-
; GFX12-NEXT: s_add_co_i32 s0, s0, 0
88-
; GFX12-NEXT: s_add_co_i32 s1, s1, 0
8979
; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
9080
; GFX12-NEXT: s_wait_storecnt 0x0
9181
; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
@@ -1044,11 +1034,11 @@ define void @store_load_large_imm_offset_foo() {
10441034
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10451035
; GFX9-NEXT: s_movk_i32 s0, 0x3e80
10461036
; GFX9-NEXT: v_mov_b32_e32 v0, 13
1047-
; GFX9-NEXT: s_add_i32 s0, s32, s0
1037+
; GFX9-NEXT: s_add_i32 s1, s32, s0
10481038
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4
10491039
; GFX9-NEXT: s_waitcnt vmcnt(0)
10501040
; GFX9-NEXT: v_mov_b32_e32 v0, 15
1051-
; GFX9-NEXT: s_add_i32 s0, s0, 4
1041+
; GFX9-NEXT: s_add_i32 s0, s1, 4
10521042
; GFX9-NEXT: scratch_store_dword off, v0, s0
10531043
; GFX9-NEXT: s_waitcnt vmcnt(0)
10541044
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
@@ -1061,8 +1051,8 @@ define void @store_load_large_imm_offset_foo() {
10611051
; GFX10-NEXT: v_mov_b32_e32 v0, 13
10621052
; GFX10-NEXT: s_movk_i32 s0, 0x3e80
10631053
; GFX10-NEXT: v_mov_b32_e32 v1, 15
1064-
; GFX10-NEXT: s_add_i32 s0, s32, s0
1065-
; GFX10-NEXT: s_add_i32 s0, s0, 4
1054+
; GFX10-NEXT: s_add_i32 s1, s32, s0
1055+
; GFX10-NEXT: s_add_i32 s0, s1, 4
10661056
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4
10671057
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
10681058
; GFX10-NEXT: scratch_store_dword off, v1, s0
@@ -1076,11 +1066,11 @@ define void @store_load_large_imm_offset_foo() {
10761066
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10771067
; GFX940-NEXT: s_movk_i32 s0, 0x3e80
10781068
; GFX940-NEXT: v_mov_b32_e32 v0, 13
1079-
; GFX940-NEXT: s_add_i32 s0, s32, s0
1069+
; GFX940-NEXT: s_add_i32 s1, s32, s0
10801070
; GFX940-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1
10811071
; GFX940-NEXT: s_waitcnt vmcnt(0)
10821072
; GFX940-NEXT: v_mov_b32_e32 v0, 15
1083-
; GFX940-NEXT: s_add_i32 s0, s0, 4
1073+
; GFX940-NEXT: s_add_i32 s0, s1, 4
10841074
; GFX940-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
10851075
; GFX940-NEXT: s_waitcnt vmcnt(0)
10861076
; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
@@ -1093,8 +1083,8 @@ define void @store_load_large_imm_offset_foo() {
10931083
; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
10941084
; GFX11-NEXT: s_movk_i32 s0, 0x3e80
10951085
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1096-
; GFX11-NEXT: s_add_i32 s0, s32, s0
1097-
; GFX11-NEXT: s_add_i32 s0, s0, 4
1086+
; GFX11-NEXT: s_add_i32 s1, s32, s0
1087+
; GFX11-NEXT: s_add_i32 s0, s1, 4
10981088
; GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc
10991089
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
11001090
; GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc

0 commit comments

Comments
 (0)