Commit 8576cb8

[WIP] AMDGPU: Handle v_add* in eliminateFrameIndex
1 parent 8726c2b commit 8576cb8
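
Teach eliminateFrameIndex to rewrite v_add_u32 / v_add_co_u32 instructions that
reference a frame index in place, rather than deferring to the default handling.
FIOp becomes a pointer so it can be swapped with the other source operand when
the instruction is commuted. A minimal sketch of the non-flat-scratch path
(illustrative MIR only; the registers, the wave64 shift amount, and the offset
of 16 are made-up values, not taken from this commit):

    ; before: add of a frame index, SGPR frame register $sgpr32, object offset 16
    $vgpr0 = V_ADD_U32_e64 %stack.0, $vgpr1, 0

    ; after: shift the wave-scaled frame register down to a per-lane byte
    ; offset in a scavenged VGPR, add it in place, then fold the object
    ; offset into the surviving immediate operand
    $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr32
    $vgpr0 = V_ADD_U32_e64 $vgpr1, $vgpr2, 0
    $vgpr0 = V_ADD_U32_e64 16, killed $vgpr0, 0

When the folded offset is 0 (and vcc is dead and clamp is unset), the trailing
add is folded away entirely, into a COPY or v_mov_b32 where one is still needed.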

14 files changed: +802 -902 lines changed

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 224 additions & 22 deletions
@@ -2250,7 +2250,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
   assert(MF->getRegInfo().isReserved(MFI->getScratchRSrcReg()) &&
          "unreserved scratch RSRC register");
 
-  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
+  MachineOperand *FIOp = &MI->getOperand(FIOperandNum);
   int Index = MI->getOperand(FIOperandNum).getIndex();
 
   Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
@@ -2432,6 +2432,208 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     MI->eraseFromParent();
     return true;
   }
+  case AMDGPU::V_ADD_U32_e32:
+  case AMDGPU::V_ADD_U32_e64:
+  case AMDGPU::V_ADD_CO_U32_e32:
+  case AMDGPU::V_ADD_CO_U32_e64: {
+    // TODO: Handle sub, and, or.
+    unsigned NumDefs = MI->getNumExplicitDefs();
+    unsigned Src0Idx = NumDefs;
+
+    bool HasClamp = false;
+    MachineOperand *VCCOp = nullptr;
+
+    switch (MI->getOpcode()) {
+    case AMDGPU::V_ADD_U32_e32:
+      break;
+    case AMDGPU::V_ADD_U32_e64:
+      HasClamp = MI->getOperand(3).getImm();
+      break;
+    case AMDGPU::V_ADD_CO_U32_e32:
+      VCCOp = &MI->getOperand(3);
+      break;
+    case AMDGPU::V_ADD_CO_U32_e64:
+      VCCOp = &MI->getOperand(1);
+      HasClamp = MI->getOperand(4).getImm();
+      break;
+    default:
+      break;
+    }
+    bool DeadVCC = !VCCOp || VCCOp->isDead();
+    MachineOperand &DstOp = MI->getOperand(0);
+    Register DstReg = DstOp.getReg();
+
+    unsigned OtherOpIdx =
+        FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx;
+    MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx);
+
+    unsigned Src1Idx = Src0Idx + 1;
+    Register MaterializedReg = FrameReg;
+    Register ScavengedVGPR;
+
+    if (FrameReg && !ST.enableFlatScratch()) {
+      // We should just do an in-place update of the result register. However,
+      // the value there may also be used by the add, in which case we need a
+      // temporary register.
+      //
+      // FIXME: The scavenger is not finding the result register in the
+      // common case where the add does not read the register.
+
+      ScavengedVGPR = RS->scavengeRegisterBackwards(
+          AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, /*SPAdj=*/0);
+
+      // TODO: If we have a free SGPR, it's sometimes better to use a scalar
+      // shift.
+      BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64))
+          .addDef(ScavengedVGPR, RegState::Renamable)
+          .addImm(ST.getWavefrontSizeLog2())
+          .addReg(FrameReg);
+      MaterializedReg = ScavengedVGPR;
+    }
+
+    int64_t Offset = FrameInfo.getObjectOffset(Index);
+    // For the non-immediate case, we could fall through to the default
+    // handling, but we do an in-place update of the result register here to
+    // avoid scavenging another register.
+    if (OtherOp->isImm()) {
+      OtherOp->setImm(OtherOp->getImm() + Offset);
+      Offset = 0;
+    }
+
+    if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) {
+      if (ST.enableFlatScratch() &&
+          !TII->isOperandLegal(*MI, Src1Idx, OtherOp)) {
+        // We didn't need the shift above, so we have an SGPR for the frame
+        // register, but may have a VGPR only operand.
+        //
+        // TODO: On gfx10+, we can easily change the opcode to the e64 version
+        // and use the higher constant bus restriction to avoid this copy.
+
+        if (!ScavengedVGPR) {
+          ScavengedVGPR = RS->scavengeRegisterBackwards(
+              AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
+              /*SPAdj=*/0);
+        }
+
+        assert(ScavengedVGPR != DstReg);
+
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
+            .addReg(MaterializedReg,
+                    MaterializedReg != FrameReg ? RegState::Kill : 0);
+        MaterializedReg = ScavengedVGPR;
+      }
+
+      auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(MI->getOpcode()))
+                        .addDef(DstReg, RegState::Renamable);
+      if (NumDefs == 2)
+        AddI32.add(MI->getOperand(1));
+
+      unsigned MaterializedRegFlags =
+          MaterializedReg != FrameReg ? RegState::Kill : 0;
+
+      if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) {
+        // If we know we have a VGPR already, it's more likely the other
+        // operand is a legal vsrc0.
+        AddI32
+            .add(*OtherOp)
+            .addReg(MaterializedReg, MaterializedRegFlags);
+      } else {
+        // Commute operands to avoid violating VOP2 restrictions. This will
+        // typically happen when using scratch.
+        AddI32
+            .addReg(MaterializedReg, MaterializedRegFlags)
+            .add(*OtherOp);
+      }
+
+      if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
+          MI->getOpcode() == AMDGPU::V_ADD_U32_e64)
+        AddI32.addImm(0); // clamp
+
+      if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e32)
+        AddI32.setOperandDead(3); // Dead vcc
+
+      MaterializedReg = DstReg;
+
+      OtherOp->ChangeToRegister(MaterializedReg, false);
+      OtherOp->setIsKill(true);
+      FIOp->ChangeToImmediate(Offset);
+      Offset = 0;
+    } else if (Offset != 0) {
+      assert(!MaterializedReg);
+      FIOp->ChangeToImmediate(Offset);
+      Offset = 0;
+    } else {
+      if (DeadVCC && !HasClamp) {
+        assert(Offset == 0);
+
+        // TODO: Losing kills and implicit operands. Just mutate to copy and
+        // let lowerCopy deal with it?
+        if (OtherOp->isReg() && OtherOp->getReg() == DstReg) {
+          // Folded to an identity copy.
+          MI->eraseFromParent();
+          return true;
+        }
+
+        // The immediate value should be in OtherOp
+        MI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
+        MI->removeOperand(FIOperandNum);
+
+        unsigned NumOps = MI->getNumOperands();
+        for (unsigned I = NumOps - 2; I >= 2; --I)
+          MI->removeOperand(I);
+
+        if (NumDefs == 2)
+          MI->removeOperand(1);
+
+        // The code below can't deal with a mov.
+        return true;
+      }
+
+      // This folded to a constant, but we have to keep the add around for
+      // pointless implicit defs or clamp modifier.
+      FIOp->ChangeToImmediate(0);
+    }
+
+    // Try to improve legality by commuting.
+    if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) {
+      std::swap(FIOp, OtherOp);
+      std::swap(FIOperandNum, OtherOpIdx);
+    }
+
+    for (unsigned SrcIdx : {Src1Idx, Src0Idx}) {
+      // Depending on operand constraints we may need to insert another copy.
+      if (!TII->isOperandLegal(*MI, SrcIdx)) {
+        // If commuting didn't make the operands legal, we need to materialize
+        // in a register.
+        // TODO: Can use SGPR on gfx10+ in some cases.
+        if (!ScavengedVGPR) {
+          ScavengedVGPR = RS->scavengeRegisterBackwards(
+              AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
+              /*SPAdj=*/0);
+        }
+
+        assert(ScavengedVGPR != DstReg);
+
+        MachineOperand &Src = MI->getOperand(SrcIdx);
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
+            .add(Src);
+
+        Src.ChangeToRegister(ScavengedVGPR, false);
+        Src.setIsKill(true);
+      }
+    }
+
+    // Fold out add of 0 case that can appear in kernels.
+    if (FIOp->isImm() && FIOp->getImm() == 0 && DeadVCC && !HasClamp) {
+      if (OtherOp->isReg() && OtherOp->getReg() != DstReg) {
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::COPY), DstReg).add(*OtherOp);
+      }
+
+      MI->eraseFromParent();
+    }
+
+    return true;
+  }
   case AMDGPU::S_ADD_I32: {
     // TODO: Handle s_or_b32, s_and_b32.
     unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1;
@@ -2495,32 +2697,32 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       OtherOp.ChangeToRegister(MaterializedReg, false);
       OtherOp.setIsKill(true);
       OtherOp.setIsRenamable(true);
-      FIOp.ChangeToImmediate(Offset);
+      FIOp->ChangeToImmediate(Offset);
     } else if (!OtherOp.isImm() && !MaterializedReg) {
-      FIOp.ChangeToImmediate(Offset);
+      FIOp->ChangeToImmediate(Offset);
     } else {
       assert(Offset == 0);
 
       if (MaterializedReg)
-        FIOp.ChangeToRegister(MaterializedReg, false);
+        FIOp->ChangeToRegister(MaterializedReg, false);
      else
-        FIOp.ChangeToImmediate(0);
+        FIOp->ChangeToImmediate(0);
    }
 
    if (DeadSCC && OtherOp.isImm() && OtherOp.getImm() == 0) {
      assert(Offset == 0);
      MI->removeOperand(3);
      MI->removeOperand(OtherOpIdx);
-      MI->setDesc(TII->get(FIOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
-    } else if (DeadSCC && FIOp.isImm() && FIOp.getImm() == 0) {
+      MI->setDesc(TII->get(FIOp->isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
+    } else if (DeadSCC && FIOp->isImm() && FIOp->getImm() == 0) {
      assert(Offset == 0);
      MI->removeOperand(3);
      MI->removeOperand(FIOperandNum);
      MI->setDesc(
          TII->get(OtherOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
    }
 
-    assert(!FIOp.isFI());
+    assert(!FIOp->isFI());
 
    return true;
  }
@@ -2537,7 +2739,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
     // The offset is always swizzled, just replace it
     if (FrameReg)
-      FIOp.ChangeToRegister(FrameReg, false);
+      FIOp->ChangeToRegister(FrameReg, false);
 
     MachineOperand *OffsetOp =
         TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
@@ -2590,18 +2792,18 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
    }
 
    if (!FrameReg) {
-      FIOp.ChangeToImmediate(Offset);
-      if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp))
+      FIOp->ChangeToImmediate(Offset);
+      if (TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp))
        return false;
    }
 
    // We need to use register here. Check if we can use an SGPR or need
    // a VGPR.
-    FIOp.ChangeToRegister(AMDGPU::M0, false);
-    bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp);
+    FIOp->ChangeToRegister(AMDGPU::M0, false);
+    bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, FIOp);
 
    if (!Offset && FrameReg && UseSGPR) {
-      FIOp.setReg(FrameReg);
+      FIOp->setReg(FrameReg);
      return false;
    }
 
@@ -2610,8 +2812,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
    Register TmpReg =
        RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
-    FIOp.setReg(TmpReg);
-    FIOp.setIsKill();
+    FIOp->setReg(TmpReg);
+    FIOp->setIsKill();
 
    if ((!FrameReg || !Offset) && TmpReg) {
      unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
@@ -2640,8 +2842,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
    if (!TmpSReg) {
      // Use frame register and restore it after.
      TmpSReg = FrameReg;
-      FIOp.setReg(FrameReg);
-      FIOp.setIsKill(false);
+      FIOp->setReg(FrameReg);
+      FIOp->setIsKill(false);
    }
 
    if (NeedSaveSCC) {
@@ -2889,7 +3091,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
        MI->eraseFromParent();
        return true;
      }
-      FIOp.ChangeToRegister(ResultReg, false, false, true);
+      FIOp->ChangeToRegister(ResultReg, false, false, true);
      return false;
    }
 
@@ -2920,13 +3122,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
    // If the offset is simply too big, don't convert to a scratch wave offset
    // relative index.
 
-    FIOp.ChangeToImmediate(Offset);
-    if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
+    FIOp->ChangeToImmediate(Offset);
+    if (!TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp)) {
      Register TmpReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
                                                      MI, false, 0);
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
          .addImm(Offset);
-      FIOp.ChangeToRegister(TmpReg, false, false, true);
+      FIOp->ChangeToRegister(TmpReg, false, false, true);
    }
  }
}
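
On flat-scratch targets the frame register stays in an SGPR, which is only a
legal vsrc0 in the VOP2 forms, so the rewrite prefers commuting the operands
over copying through a VGPR. A minimal sketch (again illustrative MIR with
made-up registers, not output from this commit), for a zero folded offset
where the trailing add is elided:

    ; before
    $vgpr0 = V_ADD_U32_e32 $vgpr1, %stack.0

    ; after: SGPR frame register commuted into src0
    $vgpr0 = V_ADD_U32_e32 $sgpr32, $vgpr1

If the other operand is itself not legal in src1, the SGPR is first copied to
a scavenged VGPR via v_mov_b32 (the TODO in the diff notes gfx10+ could
instead switch to the e64 encoding and rely on its looser constant bus
restriction).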
