Commit bdff9b1

AMDGPU: Handle v_add* in eliminateFrameIndex
1 parent 09ba83b

15 files changed: +880 / -984 lines
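In brief: eliminateFrameIndex already rewrote frame indices used by the scalar add (see the existing S_ADD_I32 case below) and frame indices appearing in memory operands (the default case); this commit teaches it to also rewrite the VALU adds v_add_u32_e32/_e64 and v_add_co_u32_e32/_e64 in place, folding the frame offset into an immediate operand where possible instead of falling through to the generic materialization path. As a hypothetical illustration (registers and the offset of 16 are made up): in a kernel that needs no frame register, v_add_u32_e32 v0, 0, %stack.0 with the object at offset 16 now folds all the way to v_mov_b32_e32 v0, 16 via the add-of-0 fold at the end of the new case.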

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 227 additions & 22 deletions
@@ -2270,7 +2270,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
   assert(MF->getRegInfo().isReserved(MFI->getScratchRSrcReg()) &&
          "unreserved scratch RSRC register");
 
-  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
+  MachineOperand *FIOp = &MI->getOperand(FIOperandNum);
   int Index = MI->getOperand(FIOperandNum).getIndex();
 
   Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
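Note on the hunk above: the new v_add case below commutes the instruction when that improves operand legality, after which the frame-index value lives in the other source slot, so the two operand handles must be swappable. A C++ reference cannot be reseated, hence the change to a pointer. A minimal sketch of the pattern, condensed from the new case (it assumes the surrounding function's locals such as MI, TII, Src1Idx, and OtherOpIdx):

    MachineOperand *FIOp = &MI->getOperand(FIOperandNum);
    MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx);
    // Try to improve legality by commuting.
    if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) {
      std::swap(FIOp, OtherOp); // reseats the handles; impossible with references
      std::swap(FIOperandNum, OtherOpIdx);
    }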
@@ -2452,6 +2452,211 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     MI->eraseFromParent();
     return true;
   }
+  case AMDGPU::V_ADD_U32_e32:
+  case AMDGPU::V_ADD_U32_e64:
+  case AMDGPU::V_ADD_CO_U32_e32:
+  case AMDGPU::V_ADD_CO_U32_e64: {
+    // TODO: Handle sub, and, or.
+    unsigned NumDefs = MI->getNumExplicitDefs();
+    unsigned Src0Idx = NumDefs;
+
+    bool HasClamp = false;
+    MachineOperand *VCCOp = nullptr;
+
+    switch (MI->getOpcode()) {
+    case AMDGPU::V_ADD_U32_e32:
+      break;
+    case AMDGPU::V_ADD_U32_e64:
+      HasClamp = MI->getOperand(3).getImm();
+      break;
+    case AMDGPU::V_ADD_CO_U32_e32:
+      VCCOp = &MI->getOperand(3);
+      break;
+    case AMDGPU::V_ADD_CO_U32_e64:
+      VCCOp = &MI->getOperand(1);
+      HasClamp = MI->getOperand(4).getImm();
+      break;
+    default:
+      break;
+    }
+    bool DeadVCC = !VCCOp || VCCOp->isDead();
+    MachineOperand &DstOp = MI->getOperand(0);
+    Register DstReg = DstOp.getReg();
+
+    unsigned OtherOpIdx =
+        FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx;
+    MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx);
+
+    unsigned Src1Idx = Src0Idx + 1;
+    Register MaterializedReg = FrameReg;
+    Register ScavengedVGPR;
+
+    if (FrameReg && !ST.enableFlatScratch()) {
+      // We should just do an in-place update of the result register. However,
+      // the value there may also be used by the add, in which case we need a
+      // temporary register.
+      //
+      // FIXME: The scavenger is not finding the result register in the
+      // common case where the add does not read the register.
+
+      ScavengedVGPR = RS->scavengeRegisterBackwards(
+          AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, /*SPAdj=*/0);
+
+      // TODO: If we have a free SGPR, it's sometimes better to use a scalar
+      // shift.
+      BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64))
+          .addDef(ScavengedVGPR, RegState::Renamable)
+          .addImm(ST.getWavefrontSizeLog2())
+          .addReg(FrameReg);
+      MaterializedReg = ScavengedVGPR;
+    }
+
+    int64_t Offset = FrameInfo.getObjectOffset(Index);
+    // For the non-immediate case, we could fall through to the default
+    // handling, but we do an in-place update of the result register here to
+    // avoid scavenging another register.
+    if (OtherOp->isImm()) {
+      OtherOp->setImm(OtherOp->getImm() + Offset);
+      Offset = 0;
+    }
+
+    if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) {
+      if (ST.enableFlatScratch() &&
+          !TII->isOperandLegal(*MI, Src1Idx, OtherOp)) {
+        // We didn't need the shift above, so we have an SGPR for the frame
+        // register, but may have a VGPR only operand.
+        //
+        // TODO: On gfx10+, we can easily change the opcode to the e64 version
+        // and use the higher constant bus restriction to avoid this copy.
+
+        if (!ScavengedVGPR) {
+          ScavengedVGPR = RS->scavengeRegisterBackwards(
+              AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
+              /*SPAdj=*/0);
+        }
+
+        assert(ScavengedVGPR != DstReg);
+
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
+            .addReg(MaterializedReg,
+                    MaterializedReg != FrameReg ? RegState::Kill : 0);
+        MaterializedReg = ScavengedVGPR;
+      }
+
+      // TODO: In the flat scratch case, if this is an add of an SGPR, and SCC
+      // is not live, we could use a scalar add + vector add instead of 2
+      // vector adds.
+      auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(MI->getOpcode()))
+                        .addDef(DstReg, RegState::Renamable);
+      if (NumDefs == 2)
+        AddI32.add(MI->getOperand(1));
+
+      unsigned MaterializedRegFlags =
+          MaterializedReg != FrameReg ? RegState::Kill : 0;
+
+      if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) {
+        // If we know we have a VGPR already, it's more likely the other
+        // operand is a legal vsrc0.
+        AddI32
+            .add(*OtherOp)
+            .addReg(MaterializedReg, MaterializedRegFlags);
+      } else {
+        // Commute operands to avoid violating VOP2 restrictions. This will
+        // typically happen when using scratch.
+        AddI32
+            .addReg(MaterializedReg, MaterializedRegFlags)
+            .add(*OtherOp);
+      }
+
+      if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
+          MI->getOpcode() == AMDGPU::V_ADD_U32_e64)
+        AddI32.addImm(0); // clamp
+
+      if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e32)
+        AddI32.setOperandDead(3); // Dead vcc
+
+      MaterializedReg = DstReg;
+
+      OtherOp->ChangeToRegister(MaterializedReg, false);
+      OtherOp->setIsKill(true);
+      FIOp->ChangeToImmediate(Offset);
+      Offset = 0;
+    } else if (Offset != 0) {
+      assert(!MaterializedReg);
+      FIOp->ChangeToImmediate(Offset);
+      Offset = 0;
+    } else {
+      if (DeadVCC && !HasClamp) {
+        assert(Offset == 0);
+
+        // TODO: Losing kills and implicit operands. Just mutate to copy and
+        // let lowerCopy deal with it?
+        if (OtherOp->isReg() && OtherOp->getReg() == DstReg) {
+          // Folded to an identity copy.
+          MI->eraseFromParent();
+          return true;
+        }
+
+        // The immediate value should be in OtherOp
+        MI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
+        MI->removeOperand(FIOperandNum);
+
+        unsigned NumOps = MI->getNumOperands();
+        for (unsigned I = NumOps - 2; I >= 2; --I)
+          MI->removeOperand(I);
+
+        if (NumDefs == 2)
+          MI->removeOperand(1);
+
+        // The code below can't deal with a mov.
+        return true;
+      }
+
+      // This folded to a constant, but we have to keep the add around for
+      // pointless implicit defs or clamp modifier.
+      FIOp->ChangeToImmediate(0);
+    }
+
+    // Try to improve legality by commuting.
+    if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) {
+      std::swap(FIOp, OtherOp);
+      std::swap(FIOperandNum, OtherOpIdx);
+    }
+
+    for (unsigned SrcIdx : {Src1Idx, Src0Idx}) {
+      // Depending on operand constraints we may need to insert another copy.
+      if (!TII->isOperandLegal(*MI, SrcIdx)) {
+        // If commuting didn't make the operands legal, we need to materialize
+        // in a register.
+        // TODO: Can use SGPR on gfx10+ in some cases.
+        if (!ScavengedVGPR) {
+          ScavengedVGPR = RS->scavengeRegisterBackwards(
+              AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
+              /*SPAdj=*/0);
+        }
+
+        assert(ScavengedVGPR != DstReg);
+
+        MachineOperand &Src = MI->getOperand(SrcIdx);
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
+            .add(Src);
+
+        Src.ChangeToRegister(ScavengedVGPR, false);
+        Src.setIsKill(true);
+      }
+    }
+
+    // Fold out add of 0 case that can appear in kernels.
+    if (FIOp->isImm() && FIOp->getImm() == 0 && DeadVCC && !HasClamp) {
+      if (OtherOp->isReg() && OtherOp->getReg() != DstReg) {
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::COPY), DstReg).add(*OtherOp);
+      }
+
+      MI->eraseFromParent();
+    }
+
+    return true;
+  }
   case AMDGPU::S_ADD_I32: {
     // TODO: Handle s_or_b32, s_and_b32.
     unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1;
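Note on the V_LSHRREV_B32 emitted near the top of the new case: without flat scratch, the SGPR frame register holds a value scaled by the wavefront size (my reading of the surrounding code and of the "offset is always swizzled" comment below), so the case shifts it right by ST.getWavefrontSizeLog2() to recover the per-lane byte offset a VALU add can consume. A self-contained arithmetic sketch of that conversion, assuming wave64 and made-up values:

    #include <cassert>
    #include <cstdint>

    int main() {
      const unsigned WavefrontSizeLog2 = 6; // assumption: wave64
      uint32_t FrameRegValue = 0x1000;      // hypothetical wave-scaled SGPR contents
      // What the emitted v_lshrrev_b32 computes for each lane.
      uint32_t PerLaneOffset = FrameRegValue >> WavefrontSizeLog2;
      assert(PerLaneOffset == 0x40);
      return 0;
    }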
@@ -2492,9 +2697,9 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       Offset = 0;
 
       if (MaterializedReg)
-        FIOp.ChangeToRegister(MaterializedReg, false);
+        FIOp->ChangeToRegister(MaterializedReg, false);
       else
-        FIOp.ChangeToImmediate(0);
+        FIOp->ChangeToImmediate(0);
     } else if (MaterializedReg) {
       // If we can't fold the other operand, do another increment.
       Register DstReg = DstOp.getReg();
@@ -2517,27 +2722,27 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       OtherOp.ChangeToRegister(MaterializedReg, false);
       OtherOp.setIsKill(true);
       OtherOp.setIsRenamable(true);
-      FIOp.ChangeToImmediate(Offset);
+      FIOp->ChangeToImmediate(Offset);
     } else {
       // If we don't have any other offset to apply, we can just directly
       // interpret the frame index as the offset.
-      FIOp.ChangeToImmediate(Offset);
+      FIOp->ChangeToImmediate(Offset);
     }
 
     if (DeadSCC && OtherOp.isImm() && OtherOp.getImm() == 0) {
       assert(Offset == 0);
       MI->removeOperand(3);
       MI->removeOperand(OtherOpIdx);
-      MI->setDesc(TII->get(FIOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
-    } else if (DeadSCC && FIOp.isImm() && FIOp.getImm() == 0) {
+      MI->setDesc(TII->get(FIOp->isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
+    } else if (DeadSCC && FIOp->isImm() && FIOp->getImm() == 0) {
       assert(Offset == 0);
       MI->removeOperand(3);
       MI->removeOperand(FIOperandNum);
       MI->setDesc(
           TII->get(OtherOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
     }
 
-    assert(!FIOp.isFI());
+    assert(!FIOp->isFI());
     return true;
   }
   default: {
@@ -2553,7 +2758,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
     // The offset is always swizzled, just replace it
     if (FrameReg)
-      FIOp.ChangeToRegister(FrameReg, false);
+      FIOp->ChangeToRegister(FrameReg, false);
 
     MachineOperand *OffsetOp =
         TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
@@ -2606,18 +2811,18 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     }
 
     if (!FrameReg) {
-      FIOp.ChangeToImmediate(Offset);
-      if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp))
+      FIOp->ChangeToImmediate(Offset);
+      if (TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp))
         return false;
     }
 
     // We need to use register here. Check if we can use an SGPR or need
     // a VGPR.
-    FIOp.ChangeToRegister(AMDGPU::M0, false);
-    bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp);
+    FIOp->ChangeToRegister(AMDGPU::M0, false);
+    bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, FIOp);
 
     if (!Offset && FrameReg && UseSGPR) {
-      FIOp.setReg(FrameReg);
+      FIOp->setReg(FrameReg);
       return false;
     }
 
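Note on the M0 substitution in the hunk above: FIOp->ChangeToRegister(AMDGPU::M0, false) looks like a legality probe rather than a real rewrite. M0 stands in for an arbitrary SGPR so isOperandLegal can answer whether the operand may be scalar at all; the operand is then pointed at a properly scavenged register (or at the frame register) in the code that follows.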
@@ -2626,8 +2831,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
     Register TmpReg =
         RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
-    FIOp.setReg(TmpReg);
-    FIOp.setIsKill();
+    FIOp->setReg(TmpReg);
+    FIOp->setIsKill();
 
     if ((!FrameReg || !Offset) && TmpReg) {
       unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
@@ -2656,8 +2861,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     if (!TmpSReg) {
       // Use frame register and restore it after.
       TmpSReg = FrameReg;
-      FIOp.setReg(FrameReg);
-      FIOp.setIsKill(false);
+      FIOp->setReg(FrameReg);
+      FIOp->setIsKill(false);
     }
 
     if (NeedSaveSCC) {
@@ -2905,7 +3110,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       MI->eraseFromParent();
       return true;
     }
-    FIOp.ChangeToRegister(ResultReg, false, false, true);
+    FIOp->ChangeToRegister(ResultReg, false, false, true);
     return false;
   }
 
@@ -2936,13 +3141,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       // If the offset is simply too big, don't convert to a scratch wave offset
       // relative index.
 
-      FIOp.ChangeToImmediate(Offset);
-      if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
+      FIOp->ChangeToImmediate(Offset);
+      if (!TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp)) {
         Register TmpReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
                                                         MI, false, 0);
         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
             .addImm(Offset);
-        FIOp.ChangeToRegister(TmpReg, false, false, true);
+        FIOp->ChangeToRegister(TmpReg, false, false, true);
       }
     }
   }
