Skip to content

Commit dd532ee

Browse files
committed
AMDGPU: Handle v_add* in eliminateFrameIndex
1 parent 8e0daab commit dd532ee

15 files changed

+846
-942
lines changed

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 227 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -2250,7 +2250,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
22502250
assert(MF->getRegInfo().isReserved(MFI->getScratchRSrcReg()) &&
22512251
"unreserved scratch RSRC register");
22522252

2253-
MachineOperand &FIOp = MI->getOperand(FIOperandNum);
2253+
MachineOperand *FIOp = &MI->getOperand(FIOperandNum);
22542254
int Index = MI->getOperand(FIOperandNum).getIndex();
22552255

22562256
Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
@@ -2432,6 +2432,211 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
24322432
MI->eraseFromParent();
24332433
return true;
24342434
}
2435+
case AMDGPU::V_ADD_U32_e32:
2436+
case AMDGPU::V_ADD_U32_e64:
2437+
case AMDGPU::V_ADD_CO_U32_e32:
2438+
case AMDGPU::V_ADD_CO_U32_e64: {
2439+
// TODO: Handle sub, and, or.
2440+
unsigned NumDefs = MI->getNumExplicitDefs();
2441+
unsigned Src0Idx = NumDefs;
2442+
2443+
bool HasClamp = false;
2444+
MachineOperand *VCCOp = nullptr;
2445+
2446+
switch (MI->getOpcode()) {
2447+
case AMDGPU::V_ADD_U32_e32:
2448+
break;
2449+
case AMDGPU::V_ADD_U32_e64:
2450+
HasClamp = MI->getOperand(3).getImm();
2451+
break;
2452+
case AMDGPU::V_ADD_CO_U32_e32:
2453+
VCCOp = &MI->getOperand(3);
2454+
break;
2455+
case AMDGPU::V_ADD_CO_U32_e64:
2456+
VCCOp = &MI->getOperand(1);
2457+
HasClamp = MI->getOperand(4).getImm();
2458+
break;
2459+
default:
2460+
break;
2461+
}
2462+
bool DeadVCC = !VCCOp || VCCOp->isDead();
2463+
MachineOperand &DstOp = MI->getOperand(0);
2464+
Register DstReg = DstOp.getReg();
2465+
2466+
unsigned OtherOpIdx =
2467+
FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx;
2468+
MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx);
2469+
2470+
unsigned Src1Idx = Src0Idx + 1;
2471+
Register MaterializedReg = FrameReg;
2472+
Register ScavengedVGPR;
2473+
2474+
if (FrameReg && !ST.enableFlatScratch()) {
2475+
// We should just do an in-place update of the result register. However,
2476+
// the value there may also be used by the add, in which case we need a
2477+
// temporary register.
2478+
//
2479+
// FIXME: The scavenger is not finding the result register in the
2480+
// common case where the add does not read the register.
2481+
2482+
ScavengedVGPR = RS->scavengeRegisterBackwards(
2483+
AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, /*SPAdj=*/0);
2484+
2485+
// TODO: If we have a free SGPR, it's sometimes better to use a scalar
2486+
// shift.
2487+
BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64))
2488+
.addDef(ScavengedVGPR, RegState::Renamable)
2489+
.addImm(ST.getWavefrontSizeLog2())
2490+
.addReg(FrameReg);
2491+
MaterializedReg = ScavengedVGPR;
2492+
}
2493+
2494+
int64_t Offset = FrameInfo.getObjectOffset(Index);
2495+
// For the non-immediate case, we could fall through to the default
2496+
// handling, but we do an in-place update of the result register here to
2497+
// avoid scavenging another register.
2498+
if (OtherOp->isImm()) {
2499+
OtherOp->setImm(OtherOp->getImm() + Offset);
2500+
Offset = 0;
2501+
}
2502+
2503+
if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) {
2504+
if (ST.enableFlatScratch() &&
2505+
!TII->isOperandLegal(*MI, Src1Idx, OtherOp)) {
2506+
// We didn't need the shift above, so we have an SGPR for the frame
2507+
// register, but may have a VGPR only operand.
2508+
//
2509+
// TODO: On gfx10+, we can easily change the opcode to the e64 version
2510+
// and use the higher constant bus restriction to avoid this copy.
2511+
2512+
if (!ScavengedVGPR) {
2513+
ScavengedVGPR = RS->scavengeRegisterBackwards(
2514+
AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
2515+
/*SPAdj=*/0);
2516+
}
2517+
2518+
assert(ScavengedVGPR != DstReg);
2519+
2520+
BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
2521+
.addReg(MaterializedReg,
2522+
MaterializedReg != FrameReg ? RegState::Kill : 0);
2523+
MaterializedReg = ScavengedVGPR;
2524+
}
2525+
2526+
// TODO: In the flat scratch case, if this is an add of an SGPR, and SCC
2527+
// is not live, we could use a scalar add + vector add instead of 2
2528+
// vector adds.
2529+
auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(MI->getOpcode()))
2530+
.addDef(DstReg, RegState::Renamable);
2531+
if (NumDefs == 2)
2532+
AddI32.add(MI->getOperand(1));
2533+
2534+
unsigned MaterializedRegFlags =
2535+
MaterializedReg != FrameReg ? RegState::Kill : 0;
2536+
2537+
if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) {
2538+
// If we know we have a VGPR already, it's more likely the other
2539+
// operand is a legal vsrc0.
2540+
AddI32
2541+
.add(*OtherOp)
2542+
.addReg(MaterializedReg, MaterializedRegFlags);
2543+
} else {
2544+
// Commute operands to avoid violating VOP2 restrictions. This will
2545+
// typically happen when using scratch.
2546+
AddI32
2547+
.addReg(MaterializedReg, MaterializedRegFlags)
2548+
.add(*OtherOp);
2549+
}
2550+
2551+
if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
2552+
MI->getOpcode() == AMDGPU::V_ADD_U32_e64)
2553+
AddI32.addImm(0); // clamp
2554+
2555+
if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e32)
2556+
AddI32.setOperandDead(3); // Dead vcc
2557+
2558+
MaterializedReg = DstReg;
2559+
2560+
OtherOp->ChangeToRegister(MaterializedReg, false);
2561+
OtherOp->setIsKill(true);
2562+
FIOp->ChangeToImmediate(Offset);
2563+
Offset = 0;
2564+
} else if (Offset != 0) {
2565+
assert(!MaterializedReg);
2566+
FIOp->ChangeToImmediate(Offset);
2567+
Offset = 0;
2568+
} else {
2569+
if (DeadVCC && !HasClamp) {
2570+
assert(Offset == 0);
2571+
2572+
// TODO: Losing kills and implicit operands. Just mutate to copy and
2573+
// let lowerCopy deal with it?
2574+
if (OtherOp->isReg() && OtherOp->getReg() == DstReg) {
2575+
// Folded to an identity copy.
2576+
MI->eraseFromParent();
2577+
return true;
2578+
}
2579+
2580+
// The immediate value should be in OtherOp
2581+
MI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
2582+
MI->removeOperand(FIOperandNum);
2583+
2584+
unsigned NumOps = MI->getNumOperands();
2585+
for (unsigned I = NumOps - 2; I >= 2; --I)
2586+
MI->removeOperand(I);
2587+
2588+
if (NumDefs == 2)
2589+
MI->removeOperand(1);
2590+
2591+
// The code below can't deal with a mov.
2592+
return true;
2593+
}
2594+
2595+
// This folded to a constant, but we have to keep the add around for
2596+
// pointless implicit defs or clamp modifier.
2597+
FIOp->ChangeToImmediate(0);
2598+
}
2599+
2600+
// Try to improve legality by commuting.
2601+
if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) {
2602+
std::swap(FIOp, OtherOp);
2603+
std::swap(FIOperandNum, OtherOpIdx);
2604+
}
2605+
2606+
for (unsigned SrcIdx : {Src1Idx, Src0Idx}) {
2607+
// Depending on operand constraints we may need to insert another copy.
2608+
if (!TII->isOperandLegal(*MI, SrcIdx)) {
2609+
// If commuting didn't make the operands legal, we need to materialize
2610+
// in a register.
2611+
// TODO: Can use SGPR on gfx10+ in some cases.
2612+
if (!ScavengedVGPR) {
2613+
ScavengedVGPR = RS->scavengeRegisterBackwards(
2614+
AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
2615+
/*SPAdj=*/0);
2616+
}
2617+
2618+
assert(ScavengedVGPR != DstReg);
2619+
2620+
MachineOperand &Src = MI->getOperand(SrcIdx);
2621+
BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
2622+
.add(Src);
2623+
2624+
Src.ChangeToRegister(ScavengedVGPR, false);
2625+
Src.setIsKill(true);
2626+
}
2627+
}
2628+
2629+
// Fold out add of 0 case that can appear in kernels.
2630+
if (FIOp->isImm() && FIOp->getImm() == 0 && DeadVCC && !HasClamp) {
2631+
if (OtherOp->isReg() && OtherOp->getReg() != DstReg) {
2632+
BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::COPY), DstReg).add(*OtherOp);
2633+
}
2634+
2635+
MI->eraseFromParent();
2636+
}
2637+
2638+
return true;
2639+
}
24352640
case AMDGPU::S_ADD_I32: {
24362641
// TODO: Handle s_or_b32, s_and_b32.
24372642
unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1;
@@ -2472,9 +2677,9 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
24722677
Offset = 0;
24732678

24742679
if (MaterializedReg)
2475-
FIOp.ChangeToRegister(MaterializedReg, false);
2680+
FIOp->ChangeToRegister(MaterializedReg, false);
24762681
else
2477-
FIOp.ChangeToImmediate(0);
2682+
FIOp->ChangeToImmediate(0);
24782683
} else if (MaterializedReg) {
24792684
// If we can't fold the other operand, do another increment.
24802685
Register DstReg = DstOp.getReg();
@@ -2497,27 +2702,27 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
24972702
OtherOp.ChangeToRegister(MaterializedReg, false);
24982703
OtherOp.setIsKill(true);
24992704
OtherOp.setIsRenamable(true);
2500-
FIOp.ChangeToImmediate(Offset);
2705+
FIOp->ChangeToImmediate(Offset);
25012706
} else {
25022707
// If we don't have any other offset to apply, we can just directly
25032708
// interpret the frame index as the offset.
2504-
FIOp.ChangeToImmediate(Offset);
2709+
FIOp->ChangeToImmediate(Offset);
25052710
}
25062711

25072712
if (DeadSCC && OtherOp.isImm() && OtherOp.getImm() == 0) {
25082713
assert(Offset == 0);
25092714
MI->removeOperand(3);
25102715
MI->removeOperand(OtherOpIdx);
2511-
MI->setDesc(TII->get(FIOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
2512-
} else if (DeadSCC && FIOp.isImm() && FIOp.getImm() == 0) {
2716+
MI->setDesc(TII->get(FIOp->isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
2717+
} else if (DeadSCC && FIOp->isImm() && FIOp->getImm() == 0) {
25132718
assert(Offset == 0);
25142719
MI->removeOperand(3);
25152720
MI->removeOperand(FIOperandNum);
25162721
MI->setDesc(
25172722
TII->get(OtherOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
25182723
}
25192724

2520-
assert(!FIOp.isFI());
2725+
assert(!FIOp->isFI());
25212726
return true;
25222727
}
25232728
default: {
@@ -2533,7 +2738,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
25332738

25342739
// The offset is always swizzled, just replace it
25352740
if (FrameReg)
2536-
FIOp.ChangeToRegister(FrameReg, false);
2741+
FIOp->ChangeToRegister(FrameReg, false);
25372742

25382743
MachineOperand *OffsetOp =
25392744
TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
@@ -2586,18 +2791,18 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
25862791
}
25872792

25882793
if (!FrameReg) {
2589-
FIOp.ChangeToImmediate(Offset);
2590-
if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp))
2794+
FIOp->ChangeToImmediate(Offset);
2795+
if (TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp))
25912796
return false;
25922797
}
25932798

25942799
// We need to use register here. Check if we can use an SGPR or need
25952800
// a VGPR.
2596-
FIOp.ChangeToRegister(AMDGPU::M0, false);
2597-
bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp);
2801+
FIOp->ChangeToRegister(AMDGPU::M0, false);
2802+
bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, FIOp);
25982803

25992804
if (!Offset && FrameReg && UseSGPR) {
2600-
FIOp.setReg(FrameReg);
2805+
FIOp->setReg(FrameReg);
26012806
return false;
26022807
}
26032808

@@ -2606,8 +2811,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
26062811

26072812
Register TmpReg =
26082813
RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
2609-
FIOp.setReg(TmpReg);
2610-
FIOp.setIsKill();
2814+
FIOp->setReg(TmpReg);
2815+
FIOp->setIsKill();
26112816

26122817
if ((!FrameReg || !Offset) && TmpReg) {
26132818
unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
@@ -2636,8 +2841,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
26362841
if (!TmpSReg) {
26372842
// Use frame register and restore it after.
26382843
TmpSReg = FrameReg;
2639-
FIOp.setReg(FrameReg);
2640-
FIOp.setIsKill(false);
2844+
FIOp->setReg(FrameReg);
2845+
FIOp->setIsKill(false);
26412846
}
26422847

26432848
if (NeedSaveSCC) {
@@ -2885,7 +3090,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
28853090
MI->eraseFromParent();
28863091
return true;
28873092
}
2888-
FIOp.ChangeToRegister(ResultReg, false, false, true);
3093+
FIOp->ChangeToRegister(ResultReg, false, false, true);
28893094
return false;
28903095
}
28913096

@@ -2916,13 +3121,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
29163121
// If the offset is simply too big, don't convert to a scratch wave offset
29173122
// relative index.
29183123

2919-
FIOp.ChangeToImmediate(Offset);
2920-
if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
3124+
FIOp->ChangeToImmediate(Offset);
3125+
if (!TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp)) {
29213126
Register TmpReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
29223127
MI, false, 0);
29233128
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
29243129
.addImm(Offset);
2925-
FIOp.ChangeToRegister(TmpReg, false, false, true);
3130+
FIOp->ChangeToRegister(TmpReg, false, false, true);
29263131
}
29273132
}
29283133
}

0 commit comments

Comments
 (0)