Commit 5943f86

PankajDwivedi-25 authored and David Salinas committed
[AMDGPU] Inplace FI elimination during PEI for scalar copy instruction (llvm#99556)
eliminateFrameIndex wasn't handling a frame-index copy into a scalar register, and the default implementation broke the code while trying to handle it. This patch fixes the broken lowering and also takes care of some edge cases that can arise. The tricky case is a non-zero offset when SCC and VCC are live and no SGPR pair is available.

Co-authored by @arsenm

---------

Co-authored-by: Matt Arsenault <[email protected]>
Co-authored-by: PankajDwivedi-25 <[email protected]>

Change-Id: I8db13328357723da96c1bfaa2fd2368430b17100
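For context, here is a minimal sketch of the pattern this is about: a frame index used as the source of a scalar copy, which previously fell through to the default eliminateFrameIndex handling. The register numbers, the %stack.0 slot, $sgpr32 as the frame register, and the wave64 shift amount of 6 are illustrative assumptions, not the tests added by this commit.

    ; Before PEI: the frame index feeds a scalar copy.
    renamable $sgpr4 = S_MOV_B32 %stack.0

    ; With a zero object offset and SCC not live, the rewrite can stay on the
    ; SALU and simply scale the frame register from wave space to lane space:
    renamable $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc

The interesting cases handled below are the ones where this simple form is not available: a non-zero offset, or SCC live so S_LSHR_B32/S_ADD_I32 cannot be used directly.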
1 parent: f56d140

File tree: 3 files changed, +4108 −18 lines

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 86 additions & 17 deletions
@@ -2604,7 +2604,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
                                           ? &AMDGPU::SReg_32RegClass
                                           : &AMDGPU::VGPR_32RegClass;
       bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
-                    MI->getOpcode() == AMDGPU::V_MOV_B32_e64;
+                    MI->getOpcode() == AMDGPU::V_MOV_B32_e64 ||
+                    MI->getOpcode() == AMDGPU::S_MOV_B32;
       Register ResultReg =
           IsCopy ? MI->getOperand(0).getReg()
                  : RS->scavengeRegisterBackwards(*RC, MI, false, 0);
@@ -2613,7 +2614,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       if (Offset == 0) {
         unsigned OpCode = IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32
                                              : AMDGPU::V_LSHRREV_B32_e64;
-        auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), ResultReg);
+        Register TmpResultReg = ResultReg;
+        if (IsSALU && LiveSCC) {
+          TmpResultReg = RS->scavengeRegisterBackwards(
+              AMDGPU::VGPR_32RegClass, MI, false, 0);
+        }
+
+        auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), TmpResultReg);
         if (OpCode == AMDGPU::V_LSHRREV_B32_e64)
           // For V_LSHRREV, the operands are reversed (the shift count goes
           // first).
@@ -2623,11 +2630,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
         if (IsSALU && !LiveSCC)
           Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
         if (IsSALU && LiveSCC) {
-          Register NewDest = RS->scavengeRegisterBackwards(
-              AMDGPU::SReg_32RegClass, Shift, false, 0);
+          Register NewDest =
+              IsCopy ? ResultReg
+                     : RS->scavengeRegisterBackwards(AMDGPU::SReg_32RegClass,
+                                                     Shift, false, 0);
           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                   NewDest)
-              .addReg(ResultReg);
+              .addReg(TmpResultReg);
           ResultReg = NewDest;
         }
       } else {
@@ -2678,22 +2687,82 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
         // We may have 1 free scratch SGPR even though a carry out is
         // unavailable. Only one additional mov is needed.
-        Register TmpScaledReg = RS->scavengeRegisterBackwards(
-            AMDGPU::SReg_32_XM0RegClass, MI, false, 0, false);
-        Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
+        Register TmpScaledReg = IsCopy && IsSALU
+                                    ? ResultReg
+                                    : RS->scavengeRegisterBackwards(
+                                          AMDGPU::SReg_32_XM0RegClass, MI,
+                                          false, 0, /*AllowSpill=*/false);
+        Register ScaledReg =
+            TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
+        Register TmpResultReg = ScaledReg;
+
+        if (!LiveSCC) {
+          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), TmpResultReg)
+              .addReg(FrameReg)
+              .addImm(ST.getWavefrontSizeLog2());
+          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpResultReg)
+              .addReg(TmpResultReg, RegState::Kill)
+              .addImm(Offset);
+        } else {
+          TmpResultReg = RS->scavengeRegisterBackwards(
+              AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true);
+
+          MachineInstrBuilder Add;
+          if ((Add = TII->getAddNoCarry(*MBB, MI, DL, TmpResultReg, *RS))) {
+            BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
+                    TmpResultReg)
+                .addImm(ST.getWavefrontSizeLog2())
+                .addReg(FrameReg);
+            if (Add->getOpcode() == AMDGPU::V_ADD_CO_U32_e64) {
+              BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::S_MOV_B32),
+                      ResultReg)
+                  .addImm(Offset);
+              Add.addReg(ResultReg, RegState::Kill)
+                  .addReg(TmpResultReg, RegState::Kill)
+                  .addImm(0);
+            } else
+              Add.addImm(Offset).addReg(TmpResultReg, RegState::Kill);
+          } else {
+            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
+                    TmpResultReg)
+                .addImm(Offset);
+            assert(Offset > 0 &&
+                   isUInt<24>(2 * ST.getMaxWaveScratchSize()) &&
+                   "offset is unsafe for v_mad_u32_u24");
+            // We start with a frame pointer with a wave space value, and an
+            // offset in lane-space. We are materializing a lane space
+            // value. We can either do a right shift of the frame pointer to
+            // get to lane space, or a left shift of the offset to get to
+            // wavespace. We can right shift after the computation to get
+            // back to the desired per-lane value.
+            // We are using the mad_u32_u24 primarily as an add with no
+            // carry out clobber.
+            Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
+                          TmpResultReg)
+                      .addReg(TmpResultReg, RegState::Kill)
+                      .addImm(ST.getWavefrontSize())
+                      .addReg(FrameReg)
+                      .addImm(0);
+            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
+                    TmpResultReg)
+                .addImm(ST.getWavefrontSizeLog2())
+                .addReg(FrameReg);
+          }
 
-        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
-            .addReg(FrameReg)
-            .addImm(ST.getWavefrontSizeLog2());
-        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
-            .addReg(ScaledReg, RegState::Kill)
-            .addImm(Offset);
+          Register NewDest = IsCopy ? ResultReg
+                                    : RS->scavengeRegisterBackwards(
+                                          AMDGPU::SReg_32RegClass, *Add,
+                                          false, 0, /*AllowSpill=*/true);
+          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+                  NewDest)
+              .addReg(TmpResultReg);
+          ResultReg = NewDest;
+        }
         if (!IsSALU)
           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
-              .addReg(ScaledReg, RegState::Kill);
+              .addReg(TmpResultReg, RegState::Kill);
         else
-          ResultReg = ScaledReg;
-
+          ResultReg = TmpResultReg;
         // If there were truly no free SGPRs, we need to undo everything.
         if (!TmpScaledReg.isValid()) {
           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
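Putting the pieces together, the hardest path described in the commit message (non-zero offset with SCC live, so a scalar S_ADD_I32 would clobber SCC) routes the computation through a scavenged VGPR and reads the uniform result back with V_READFIRSTLANE_B32. A rough sketch of the emitted sequence, assuming a gfx9+ target where TII->getAddNoCarry can pick a carry-less VALU add, wave64 (shift amount 6), an offset of 16, and illustrative registers $vgpr1/$sgpr4/$sgpr32 (none of this is taken from the commit's tests):

    ; Scale the frame register from wave space to lane space in a scavenged VGPR.
    $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
    ; Add the object offset without touching SCC or VCC.
    $vgpr1 = V_ADD_U32_e32 16, killed $vgpr1, implicit $exec
    ; Read the (uniform) result back into the scalar destination of the copy.
    $sgpr4 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec

When not even a carry-less add is available, the V_MAD_U32_U24_e64 path above plays the same role, using the multiply-add purely as an add with no carry-out clobber.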
