Skip to content

Commit af30ca7

Browse files
committed
[WIP] AMDGPU: Handle v_add* in eliminateFrameIndex
1 parent b9a85ce commit af30ca7

14 files changed

+802
-902
lines changed

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 224 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -2250,7 +2250,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
22502250
assert(MF->getRegInfo().isReserved(MFI->getScratchRSrcReg()) &&
22512251
"unreserved scratch RSRC register");
22522252

2253-
MachineOperand &FIOp = MI->getOperand(FIOperandNum);
2253+
MachineOperand *FIOp = &MI->getOperand(FIOperandNum);
22542254
int Index = MI->getOperand(FIOperandNum).getIndex();
22552255

22562256
Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
@@ -2432,6 +2432,208 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
24322432
MI->eraseFromParent();
24332433
return true;
24342434
}
2435+
case AMDGPU::V_ADD_U32_e32:
2436+
case AMDGPU::V_ADD_U32_e64:
2437+
case AMDGPU::V_ADD_CO_U32_e32:
2438+
case AMDGPU::V_ADD_CO_U32_e64: {
2439+
// TODO: Handle sub, and, or.
2440+
unsigned NumDefs = MI->getNumExplicitDefs();
2441+
unsigned Src0Idx = NumDefs;
2442+
2443+
bool HasClamp = false;
2444+
MachineOperand *VCCOp = nullptr;
2445+
2446+
switch (MI->getOpcode()) {
2447+
case AMDGPU::V_ADD_U32_e32:
2448+
break;
2449+
case AMDGPU::V_ADD_U32_e64:
2450+
HasClamp = MI->getOperand(3).getImm();
2451+
break;
2452+
case AMDGPU::V_ADD_CO_U32_e32:
2453+
VCCOp = &MI->getOperand(3);
2454+
break;
2455+
case AMDGPU::V_ADD_CO_U32_e64:
2456+
VCCOp = &MI->getOperand(1);
2457+
HasClamp = MI->getOperand(4).getImm();
2458+
break;
2459+
default:
2460+
break;
2461+
}
2462+
bool DeadVCC = !VCCOp || VCCOp->isDead();
2463+
MachineOperand &DstOp = MI->getOperand(0);
2464+
Register DstReg = DstOp.getReg();
2465+
2466+
unsigned OtherOpIdx =
2467+
FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx;
2468+
MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx);
2469+
2470+
unsigned Src1Idx = Src0Idx + 1;
2471+
Register MaterializedReg = FrameReg;
2472+
Register ScavengedVGPR;
2473+
2474+
if (FrameReg && !ST.enableFlatScratch()) {
2475+
// We should just do an in-place update of the result register. However,
2476+
// the value there may also be used by the add, in which case we need a
2477+
// temporary register.
2478+
//
2479+
// FIXME: The scavenger is not finding the result register in the
2480+
// common case where the add does not read the register.
2481+
2482+
ScavengedVGPR = RS->scavengeRegisterBackwards(
2483+
AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, /*SPAdj=*/0);
2484+
2485+
// TODO: If we have a free SGPR, it's sometimes better to use a scalar
2486+
// shift.
2487+
BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64))
2488+
.addDef(ScavengedVGPR, RegState::Renamable)
2489+
.addImm(ST.getWavefrontSizeLog2())
2490+
.addReg(FrameReg);
2491+
MaterializedReg = ScavengedVGPR;
2492+
}
2493+
2494+
int64_t Offset = FrameInfo.getObjectOffset(Index);
2495+
// For the non-immediate case, we could fall through to the default
2496+
// handling, but we do an in-place update of the result register here to
2497+
// avoid scavenging another register.
2498+
if (OtherOp->isImm()) {
2499+
OtherOp->setImm(OtherOp->getImm() + Offset);
2500+
Offset = 0;
2501+
}
2502+
2503+
if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) {
2504+
if (ST.enableFlatScratch() &&
2505+
!TII->isOperandLegal(*MI, Src1Idx, OtherOp)) {
2506+
// We didn't need the shift above, so we have an SGPR for the frame
2507+
// register, but may have a VGPR-only operand.
2508+
//
2509+
// TODO: On gfx10+, we can easily change the opcode to the e64 version
2510+
// and use the higher constant bus restriction to avoid this copy.
2511+
2512+
if (!ScavengedVGPR) {
2513+
ScavengedVGPR = RS->scavengeRegisterBackwards(
2514+
AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
2515+
/*SPAdj=*/0);
2516+
}
2517+
2518+
assert(ScavengedVGPR != DstReg);
2519+
2520+
BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
2521+
.addReg(MaterializedReg,
2522+
MaterializedReg != FrameReg ? RegState::Kill : 0);
2523+
MaterializedReg = ScavengedVGPR;
2524+
}
2525+
2526+
auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(MI->getOpcode()))
2527+
.addDef(DstReg, RegState::Renamable);
2528+
if (NumDefs == 2)
2529+
AddI32.add(MI->getOperand(1));
2530+
2531+
unsigned MaterializedRegFlags =
2532+
MaterializedReg != FrameReg ? RegState::Kill : 0;
2533+
2534+
if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) {
2535+
// If we know we have a VGPR already, it's more likely the other
2536+
// operand is a legal vsrc0.
2537+
AddI32
2538+
.add(*OtherOp)
2539+
.addReg(MaterializedReg, MaterializedRegFlags);
2540+
} else {
2541+
// Commute operands to avoid violating VOP2 restrictions. This will
2542+
// typically happen when using scratch.
2543+
AddI32
2544+
.addReg(MaterializedReg, MaterializedRegFlags)
2545+
.add(*OtherOp);
2546+
}
2547+
2548+
if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
2549+
MI->getOpcode() == AMDGPU::V_ADD_U32_e64)
2550+
AddI32.addImm(0); // clamp
2551+
2552+
if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e32)
2553+
AddI32.setOperandDead(3); // Dead vcc
2554+
2555+
MaterializedReg = DstReg;
2556+
2557+
OtherOp->ChangeToRegister(MaterializedReg, false);
2558+
OtherOp->setIsKill(true);
2559+
FIOp->ChangeToImmediate(Offset);
2560+
Offset = 0;
2561+
} else if (Offset != 0) {
2562+
assert(!MaterializedReg);
2563+
FIOp->ChangeToImmediate(Offset);
2564+
Offset = 0;
2565+
} else {
2566+
if (DeadVCC && !HasClamp) {
2567+
assert(Offset == 0);
2568+
2569+
// TODO: Losing kills and implicit operands. Just mutate to copy and
2570+
// let lowerCopy deal with it?
2571+
if (OtherOp->isReg() && OtherOp->getReg() == DstReg) {
2572+
// Folded to an identity copy.
2573+
MI->eraseFromParent();
2574+
return true;
2575+
}
2576+
2577+
// The immediate value should be in OtherOp
2578+
MI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
2579+
MI->removeOperand(FIOperandNum);
2580+
2581+
unsigned NumOps = MI->getNumOperands();
2582+
for (unsigned I = NumOps - 2; I >= 2; --I)
2583+
MI->removeOperand(I);
2584+
2585+
if (NumDefs == 2)
2586+
MI->removeOperand(1);
2587+
2588+
// The code below can't deal with a mov.
2589+
return true;
2590+
}
2591+
2592+
// This folded to a constant, but we have to keep the add around for
2593+
// pointless implicit defs or clamp modifier.
2594+
FIOp->ChangeToImmediate(0);
2595+
}
2596+
2597+
// Try to improve legality by commuting.
2598+
if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) {
2599+
std::swap(FIOp, OtherOp);
2600+
std::swap(FIOperandNum, OtherOpIdx);
2601+
}
2602+
2603+
for (unsigned SrcIdx : {Src1Idx, Src0Idx}) {
2604+
// Depending on operand constraints we may need to insert another copy.
2605+
if (!TII->isOperandLegal(*MI, SrcIdx)) {
2606+
// If commuting didn't make the operands legal, we need to materialize
2607+
// in a register.
2608+
// TODO: Can use SGPR on gfx10+ in some cases.
2609+
if (!ScavengedVGPR) {
2610+
ScavengedVGPR = RS->scavengeRegisterBackwards(
2611+
AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
2612+
/*SPAdj=*/0);
2613+
}
2614+
2615+
assert(ScavengedVGPR != DstReg);
2616+
2617+
MachineOperand &Src = MI->getOperand(SrcIdx);
2618+
BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
2619+
.add(Src);
2620+
2621+
Src.ChangeToRegister(ScavengedVGPR, false);
2622+
Src.setIsKill(true);
2623+
}
2624+
}
2625+
2626+
// Fold out add of 0 case that can appear in kernels.
2627+
if (FIOp->isImm() && FIOp->getImm() == 0 && DeadVCC && !HasClamp) {
2628+
if (OtherOp->isReg() && OtherOp->getReg() != DstReg) {
2629+
BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::COPY), DstReg).add(*OtherOp);
2630+
}
2631+
2632+
MI->eraseFromParent();
2633+
}
2634+
2635+
return true;
2636+
}
24352637
case AMDGPU::S_ADD_I32: {
24362638
// TODO: Handle s_or_b32, s_and_b32.
24372639
unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1;
@@ -2472,9 +2674,9 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
24722674
Offset = 0;
24732675

24742676
if (MaterializedReg)
2475-
FIOp.ChangeToRegister(MaterializedReg, false);
2677+
FIOp->ChangeToRegister(MaterializedReg, false);
24762678
else
2477-
FIOp.ChangeToImmediate(0);
2679+
FIOp->ChangeToImmediate(0);
24782680
} else if (MaterializedReg) {
24792681
// If we can't fold the other operand, do another increment.
24802682
Register DstReg = DstOp.getReg();
@@ -2497,27 +2699,27 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
24972699
OtherOp.ChangeToRegister(MaterializedReg, false);
24982700
OtherOp.setIsKill(true);
24992701
OtherOp.setIsRenamable(true);
2500-
FIOp.ChangeToImmediate(Offset);
2702+
FIOp->ChangeToImmediate(Offset);
25012703
} else {
25022704
// If we don't have any other offset to apply, we can just directly
25032705
// interpret the frame index as the offset.
2504-
FIOp.ChangeToImmediate(Offset);
2706+
FIOp->ChangeToImmediate(Offset);
25052707
}
25062708

25072709
if (DeadSCC && OtherOp.isImm() && OtherOp.getImm() == 0) {
25082710
assert(Offset == 0);
25092711
MI->removeOperand(3);
25102712
MI->removeOperand(OtherOpIdx);
2511-
MI->setDesc(TII->get(FIOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
2512-
} else if (DeadSCC && FIOp.isImm() && FIOp.getImm() == 0) {
2713+
MI->setDesc(TII->get(FIOp->isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
2714+
} else if (DeadSCC && FIOp->isImm() && FIOp->getImm() == 0) {
25132715
assert(Offset == 0);
25142716
MI->removeOperand(3);
25152717
MI->removeOperand(FIOperandNum);
25162718
MI->setDesc(
25172719
TII->get(OtherOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
25182720
}
25192721

2520-
assert(!FIOp.isFI());
2722+
assert(!FIOp->isFI());
25212723
return true;
25222724
}
25232725
default: {
@@ -2533,7 +2735,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
25332735

25342736
// The offset is always swizzled, just replace it
25352737
if (FrameReg)
2536-
FIOp.ChangeToRegister(FrameReg, false);
2738+
FIOp->ChangeToRegister(FrameReg, false);
25372739

25382740
MachineOperand *OffsetOp =
25392741
TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
@@ -2586,18 +2788,18 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
25862788
}
25872789

25882790
if (!FrameReg) {
2589-
FIOp.ChangeToImmediate(Offset);
2590-
if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp))
2791+
FIOp->ChangeToImmediate(Offset);
2792+
if (TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp))
25912793
return false;
25922794
}
25932795

25942796
// We need to use a register here. Check if we can use an SGPR or need
25952797
// a VGPR.
2596-
FIOp.ChangeToRegister(AMDGPU::M0, false);
2597-
bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp);
2798+
FIOp->ChangeToRegister(AMDGPU::M0, false);
2799+
bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, FIOp);
25982800

25992801
if (!Offset && FrameReg && UseSGPR) {
2600-
FIOp.setReg(FrameReg);
2802+
FIOp->setReg(FrameReg);
26012803
return false;
26022804
}
26032805

@@ -2606,8 +2808,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
26062808

26072809
Register TmpReg =
26082810
RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
2609-
FIOp.setReg(TmpReg);
2610-
FIOp.setIsKill();
2811+
FIOp->setReg(TmpReg);
2812+
FIOp->setIsKill();
26112813

26122814
if ((!FrameReg || !Offset) && TmpReg) {
26132815
unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
@@ -2636,8 +2838,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
26362838
if (!TmpSReg) {
26372839
// Use frame register and restore it after.
26382840
TmpSReg = FrameReg;
2639-
FIOp.setReg(FrameReg);
2640-
FIOp.setIsKill(false);
2841+
FIOp->setReg(FrameReg);
2842+
FIOp->setIsKill(false);
26412843
}
26422844

26432845
if (NeedSaveSCC) {
@@ -2885,7 +3087,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
28853087
MI->eraseFromParent();
28863088
return true;
28873089
}
2888-
FIOp.ChangeToRegister(ResultReg, false, false, true);
3090+
FIOp->ChangeToRegister(ResultReg, false, false, true);
28893091
return false;
28903092
}
28913093

@@ -2916,13 +3118,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
29163118
// If the offset is simply too big, don't convert to a scratch wave offset
29173119
// relative index.
29183120

2919-
FIOp.ChangeToImmediate(Offset);
2920-
if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
3121+
FIOp->ChangeToImmediate(Offset);
3122+
if (!TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp)) {
29213123
Register TmpReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
29223124
MI, false, 0);
29233125
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
29243126
.addImm(Offset);
2925-
FIOp.ChangeToRegister(TmpReg, false, false, true);
3127+
FIOp->ChangeToRegister(TmpReg, false, false, true);
29263128
}
29273129
}
29283130
}

0 commit comments

Comments
 (0)