
Commit 57d10b4

[AMDGPU] Inplace FI elimination during PEI for scalar copy instruction (#99556)

eliminateFrameIndex was not handling a frame-index copy into a scalar
register, and the default implementation broke the code while trying to
handle it. This patch handles the previously broken lowering and also takes
care of the edge cases that can arise. The tricky case is a non-zero offset
when SCC and VCC are live and no free SGPR pair is available.

Co-authored by @arsenm

---------

Co-authored-by: Matt Arsenault <[email protected]>
Co-authored-by: PankajDwivedi-25 <[email protected]>
1 parent 996075d commit 57d10b4
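
All of the expansions in the diff below materialize the same value: the
per-lane frame address derived from a wave-space frame register. A minimal
standalone sketch of that arithmetic (illustrative only; the function name
is made up, not part of the patch):

    #include <cstdint>

    // FrameReg holds a wave-space byte address, Offset is a per-lane byte
    // offset, and WaveLog2 is log2 of the wavefront size (5 or 6 on AMDGPU).
    static uint32_t materializedFrameAddress(uint32_t FrameReg,
                                             uint32_t Offset,
                                             unsigned WaveLog2) {
      // Fast path in the patch: S_LSHR_B32 followed by S_ADD_I32.
      return (FrameReg >> WaveLog2) + Offset;
    }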

File tree

3 files changed: +1760, -18 lines

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 86 additions & 17 deletions
@@ -2449,7 +2449,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
               ? &AMDGPU::SReg_32RegClass
               : &AMDGPU::VGPR_32RegClass;
       bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
-                    MI->getOpcode() == AMDGPU::V_MOV_B32_e64;
+                    MI->getOpcode() == AMDGPU::V_MOV_B32_e64 ||
+                    MI->getOpcode() == AMDGPU::S_MOV_B32;
       Register ResultReg =
           IsCopy ? MI->getOperand(0).getReg()
                  : RS->scavengeRegisterBackwards(*RC, MI, false, 0);
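
The functional change in this hunk: an S_MOV_B32 whose source operand is a
frame index is now classified as a copy, so its existing scalar destination
register can be reused in place instead of scavenging a fresh one. A hedged
paraphrase of the check (the helper name is illustrative, not from the
patch):

    // Scalar and vector immediate-movs from a frame index are plain copies
    // whose destination register is reusable in place.
    static bool isFrameIndexCopy(unsigned Opc) {
      return Opc == AMDGPU::V_MOV_B32_e32 || Opc == AMDGPU::V_MOV_B32_e64 ||
             Opc == AMDGPU::S_MOV_B32;
    }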
@@ -2458,7 +2459,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       if (Offset == 0) {
         unsigned OpCode = IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32
                                              : AMDGPU::V_LSHRREV_B32_e64;
-        auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), ResultReg);
+        Register TmpResultReg = ResultReg;
+        if (IsSALU && LiveSCC) {
+          TmpResultReg = RS->scavengeRegisterBackwards(
+              AMDGPU::VGPR_32RegClass, MI, false, 0);
+        }
+
+        auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), TmpResultReg);
         if (OpCode == AMDGPU::V_LSHRREV_B32_e64)
           // For V_LSHRREV, the operands are reversed (the shift count goes
           // first).
@@ -2468,11 +2475,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
         if (IsSALU && !LiveSCC)
           Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
         if (IsSALU && LiveSCC) {
-          Register NewDest = RS->scavengeRegisterBackwards(
-              AMDGPU::SReg_32RegClass, Shift, false, 0);
+          Register NewDest =
+              IsCopy ? ResultReg
+                     : RS->scavengeRegisterBackwards(AMDGPU::SReg_32RegClass,
+                                                     Shift, false, 0);
           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                   NewDest)
-              .addReg(ResultReg);
+              .addReg(TmpResultReg);
           ResultReg = NewDest;
         }
       } else {
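
Together, the two hunks above handle the Offset == 0 case when the
destination is scalar but SCC is live: the shift is performed in a scavenged
VGPR via V_LSHRREV_B32 (which does not write SCC), and V_READFIRSTLANE_B32
then moves the wave-uniform result into the scalar destination. A rough
functional model of the emitted sequence (plain C++, not the LLVM API; names
are illustrative):

    #include <cstdint>

    static uint32_t expandOffsetZeroSccLive(uint32_t FrameReg,
                                            unsigned WaveLog2) {
      uint32_t TmpVgpr = FrameReg >> WaveLog2; // V_LSHRREV_B32_e64, SCC-safe
      uint32_t Sgpr = TmpVgpr;                 // V_READFIRSTLANE_B32
      return Sgpr;                             // SCC is never clobbered
    }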
@@ -2523,22 +2532,82 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
         // We may have 1 free scratch SGPR even though a carry out is
         // unavailable. Only one additional mov is needed.
-        Register TmpScaledReg = RS->scavengeRegisterBackwards(
-            AMDGPU::SReg_32_XM0RegClass, MI, false, 0, false);
-        Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
+        Register TmpScaledReg = IsCopy && IsSALU
+                                    ? ResultReg
+                                    : RS->scavengeRegisterBackwards(
+                                          AMDGPU::SReg_32_XM0RegClass, MI,
+                                          false, 0, /*AllowSpill=*/false);
+        Register ScaledReg =
+            TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
+        Register TmpResultReg = ScaledReg;
+
+        if (!LiveSCC) {
+          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), TmpResultReg)
+              .addReg(FrameReg)
+              .addImm(ST.getWavefrontSizeLog2());
+          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpResultReg)
+              .addReg(TmpResultReg, RegState::Kill)
+              .addImm(Offset);
+        } else {
+          TmpResultReg = RS->scavengeRegisterBackwards(
+              AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true);
+
+          MachineInstrBuilder Add;
+          if ((Add = TII->getAddNoCarry(*MBB, MI, DL, TmpResultReg, *RS))) {
+            BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
+                    TmpResultReg)
+                .addImm(ST.getWavefrontSizeLog2())
+                .addReg(FrameReg);
+            if (Add->getOpcode() == AMDGPU::V_ADD_CO_U32_e64) {
+              BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::S_MOV_B32),
+                      ResultReg)
+                  .addImm(Offset);
+              Add.addReg(ResultReg, RegState::Kill)
+                  .addReg(TmpResultReg, RegState::Kill)
+                  .addImm(0);
+            } else
+              Add.addImm(Offset).addReg(TmpResultReg, RegState::Kill);
+          } else {
+            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
+                    TmpResultReg)
+                .addImm(Offset);
+            assert(Offset > 0 &&
+                   isUInt<24>(2 * ST.getMaxWaveScratchSize()) &&
+                   "offset is unsafe for v_mad_u32_u24");
+            // We start with a frame pointer with a wave space value, and an
+            // offset in lane-space. We are materializing a lane space
+            // value. We can either do a right shift of the frame pointer to
+            // get to lane space, or a left shift of the offset to get to
+            // wavespace. We can right shift after the computation to get
+            // back to the desired per-lane value.
+            // We are using the mad_u32_u24 primarily as an add with no
+            // carry out clobber.
+            Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
+                          TmpResultReg)
+                      .addReg(TmpResultReg, RegState::Kill)
+                      .addImm(ST.getWavefrontSize())
+                      .addReg(FrameReg)
+                      .addImm(0);
+            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
+                    TmpResultReg)
+                .addImm(ST.getWavefrontSizeLog2())
+                .addReg(FrameReg);
+          }
 
-        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
-            .addReg(FrameReg)
-            .addImm(ST.getWavefrontSizeLog2());
-        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
-            .addReg(ScaledReg, RegState::Kill)
-            .addImm(Offset);
+          Register NewDest = IsCopy ? ResultReg
+                                    : RS->scavengeRegisterBackwards(
+                                          AMDGPU::SReg_32RegClass, *Add,
+                                          false, 0, /*AllowSpill=*/true);
+          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+                  NewDest)
+              .addReg(TmpResultReg);
+          ResultReg = NewDest;
+        }
         if (!IsSALU)
           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
-              .addReg(ScaledReg, RegState::Kill);
+              .addReg(TmpResultReg, RegState::Kill);
         else
-          ResultReg = ScaledReg;
-
+          ResultReg = TmpResultReg;
         // If there were truly no free SGPRs, we need to undo everything.
         if (!TmpScaledReg.isValid()) {
           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
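
The V_MAD_U32_U24 fallback is the subtle part of this hunk. Per the in-code
comment, the offset is scaled from lane space up to wave space
(Offset * WavefrontSize), added to the wave-space frame register using the
mad purely as an add that clobbers no carry bits, and the sum is shifted
back down to lane space. A standalone sketch of that arithmetic under those
assumptions (illustrative only, not the patch's code):

    #include <cassert>
    #include <cstdint>

    static uint32_t madExpansion(uint32_t FrameReg, uint32_t Offset,
                                 unsigned WaveLog2) {
      uint32_t WaveSize = 1u << WaveLog2;
      // The mad multiplies 24-bit operands; the assert in the patch guards
      // this bound via 2 * MaxWaveScratchSize.
      assert(Offset < (1u << 24) / WaveSize && "offset unsafe for u24 mad");
      uint32_t WaveSpace = Offset * WaveSize + FrameReg; // V_MAD_U32_U24_e64
      return WaveSpace >> WaveLog2; // V_LSHRREV_B32_e64, back to lane space
    }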
