
Commit 498eec7

PankajDwivedi-25 authored and vikramRH committed
[AMDGPU] Inplace FI elimination during PEI for scalar copy instruction (llvm#99556)
eliminateFrameIndex wasn't handling the copy of a frame index into a scalar register, and the default implementation broke the code while trying to handle it. This patch handles the broken lowering and also takes care of some edge cases that might arise. The tricky case is a non-zero offset where SCC and VCC are both live and no free SGPR pair is available.

Co-authored by @arsenm

---------

Co-authored-by: Matt Arsenault <[email protected]>
Co-authored-by: PankajDwivedi-25 <[email protected]>

Change-Id: I760538f594d267bdc414833c4489486e426fb55b
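For illustration, the case this patch teaches eliminateFrameIndex to expand in place is a copy of a frame index into an SGPR. The MIR below is a hedged sketch of the patch's zero-offset paths; the registers ($sgpr4 destination, $sgpr32 frame register, $vgpr0 scratch), the stack object, and the wave64 shift amount of 6 are illustrative, not taken from the patch's tests.

    ; Before PEI: a frame-index copy into a scalar register.
    $sgpr4 = S_MOV_B32 %stack.0

    ; Zero offset, SCC free: the SALU path shifts the frame register in place.
    $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc

    ; Zero offset, SCC live: shift in a scavenged VGPR, then read the lane
    ; value back into the SGPR destination.
    $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32
    $sgpr4 = V_READFIRSTLANE_B32 killed $vgpr0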
1 parent: b6253c6 · commit: 498eec7

3 files changed (+4120, -18 lines)


llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp (86 additions, 17 deletions)
@@ -2615,7 +2615,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
                                             ? &AMDGPU::SReg_32RegClass
                                             : &AMDGPU::VGPR_32RegClass;
         bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
-                      MI->getOpcode() == AMDGPU::V_MOV_B32_e64;
+                      MI->getOpcode() == AMDGPU::V_MOV_B32_e64 ||
+                      MI->getOpcode() == AMDGPU::S_MOV_B32;
         Register ResultReg =
             IsCopy ? MI->getOperand(0).getReg()
                    : RS->scavengeRegisterBackwards(*RC, MI, false, 0);
@@ -2624,7 +2625,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
         if (Offset == 0) {
           unsigned OpCode = IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32
                                                : AMDGPU::V_LSHRREV_B32_e64;
-          auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), ResultReg);
+          Register TmpResultReg = ResultReg;
+          if (IsSALU && LiveSCC) {
+            TmpResultReg = RS->scavengeRegisterBackwards(
+                AMDGPU::VGPR_32RegClass, MI, false, 0);
+          }
+
+          auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), TmpResultReg);
           if (OpCode == AMDGPU::V_LSHRREV_B32_e64)
             // For V_LSHRREV, the operands are reversed (the shift count goes
             // first).
@@ -2634,11 +2641,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
           if (IsSALU && !LiveSCC)
             Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
           if (IsSALU && LiveSCC) {
-            Register NewDest = RS->scavengeRegisterBackwards(
-                AMDGPU::SReg_32RegClass, Shift, false, 0);
+            Register NewDest =
+                IsCopy ? ResultReg
+                       : RS->scavengeRegisterBackwards(AMDGPU::SReg_32RegClass,
+                                                       Shift, false, 0);
             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                     NewDest)
-                .addReg(ResultReg);
+                .addReg(TmpResultReg);
             ResultReg = NewDest;
           }
         } else {
@@ -2689,22 +2698,82 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
           // We may have 1 free scratch SGPR even though a carry out is
           // unavailable. Only one additional mov is needed.
-          Register TmpScaledReg = RS->scavengeRegisterBackwards(
-              AMDGPU::SReg_32_XM0RegClass, MI, false, 0, false);
-          Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
+          Register TmpScaledReg = IsCopy && IsSALU
+                                      ? ResultReg
+                                      : RS->scavengeRegisterBackwards(
+                                            AMDGPU::SReg_32_XM0RegClass, MI,
+                                            false, 0, /*AllowSpill=*/false);
+          Register ScaledReg =
+              TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
+          Register TmpResultReg = ScaledReg;
+
+          if (!LiveSCC) {
+            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), TmpResultReg)
+                .addReg(FrameReg)
+                .addImm(ST.getWavefrontSizeLog2());
+            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpResultReg)
+                .addReg(TmpResultReg, RegState::Kill)
+                .addImm(Offset);
+          } else {
+            TmpResultReg = RS->scavengeRegisterBackwards(
+                AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true);
+
+            MachineInstrBuilder Add;
+            if ((Add = TII->getAddNoCarry(*MBB, MI, DL, TmpResultReg, *RS))) {
+              BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
+                      TmpResultReg)
+                  .addImm(ST.getWavefrontSizeLog2())
+                  .addReg(FrameReg);
+              if (Add->getOpcode() == AMDGPU::V_ADD_CO_U32_e64) {
+                BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::S_MOV_B32),
+                        ResultReg)
+                    .addImm(Offset);
+                Add.addReg(ResultReg, RegState::Kill)
+                    .addReg(TmpResultReg, RegState::Kill)
+                    .addImm(0);
+              } else
+                Add.addImm(Offset).addReg(TmpResultReg, RegState::Kill);
+            } else {
+              BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
+                      TmpResultReg)
+                  .addImm(Offset);
+              assert(Offset > 0 &&
+                     isUInt<24>(2 * ST.getMaxWaveScratchSize()) &&
+                     "offset is unsafe for v_mad_u32_u24");
+              // We start with a frame pointer with a wave space value, and an
+              // offset in lane-space. We are materializing a lane space
+              // value. We can either do a right shift of the frame pointer to
+              // get to lane space, or a left shift of the offset to get to
+              // wavespace. We can right shift after the computation to get
+              // back to the desired per-lane value.
+              // We are using the mad_u32_u24 primarily as an add with no
+              // carry out clobber.
+              Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
+                            TmpResultReg)
+                        .addReg(TmpResultReg, RegState::Kill)
+                        .addImm(ST.getWavefrontSize())
+                        .addReg(FrameReg)
+                        .addImm(0);
+              BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
+                      TmpResultReg)
+                  .addImm(ST.getWavefrontSizeLog2())
+                  .addReg(FrameReg);
+            }
 
-          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
-              .addReg(FrameReg)
-              .addImm(ST.getWavefrontSizeLog2());
-          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
-              .addReg(ScaledReg, RegState::Kill)
-              .addImm(Offset);
+            Register NewDest = IsCopy ? ResultReg
+                                      : RS->scavengeRegisterBackwards(
+                                            AMDGPU::SReg_32RegClass, *Add,
+                                            false, 0, /*AllowSpill=*/true);
+            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+                    NewDest)
+                .addReg(TmpResultReg);
+            ResultReg = NewDest;
+          }
           if (!IsSALU)
             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
-                .addReg(ScaledReg, RegState::Kill);
+                .addReg(TmpResultReg, RegState::Kill);
           else
-            ResultReg = ScaledReg;
-
+            ResultReg = TmpResultReg;
           // If there were truly no free SGPRs, we need to undo everything.
           if (!TmpScaledReg.isValid()) {
             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
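The comment in the last hunk is the heart of the non-zero-offset fallback: the frame register holds a wave-space byte value, the offset is a lane-space value, and V_MAD_U32_U24 serves as an add that clobbers no carry-out (neither SCC nor VCC). Below is a standalone C++ sketch of the identity the mad-then-shift sequence relies on; it is illustrative only (wave64 assumed, loop bounds arbitrary), and in the compiler itself the assert on isUInt<24>(2 * ST.getMaxWaveScratchSize()) is what rules out 32-bit overflow.

    #include <cassert>
    #include <cstdint>

    // Offset * 64 leaves the low 6 bits untouched, so shifting right after
    // the mad recovers the lane-space sum (FrameReg >> 6) + Offset exactly.
    int main() {
      const uint32_t WavefrontSize = 64;    // wave64
      const uint32_t WavefrontSizeLog2 = 6;
      for (uint32_t FrameReg = 0; FrameReg < (1u << 20); FrameReg += 64) {
        for (uint32_t Offset = 1; Offset < (1u << 10); ++Offset) {
          uint32_t Mad = Offset * WavefrontSize + FrameReg; // V_MAD_U32_U24
          uint32_t Lane = Mad >> WavefrontSizeLog2;         // V_LSHRREV_B32
          assert(Lane == (FrameReg >> WavefrontSizeLog2) + Offset);
        }
      }
      return 0;
    }

Compiled and run with any C++ compiler (e.g. c++ check.cpp && ./a.out), it exits silently when the identity holds for every tested pair.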
