Commit 46686cb

[WIP] AMDGPU: Handle v_add* in eliminateFrameIndex
1 parent ac17eed · commit 46686cb

13 files changed: +795 -895 lines changed

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 218 additions & 16 deletions
@@ -2086,7 +2086,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
   assert(MF->getRegInfo().isReserved(MFI->getScratchRSrcReg()) &&
          "unreserved scratch RSRC register");
 
-  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
+  MachineOperand *FIOp = &MI->getOperand(FIOperandNum);
   int Index = MI->getOperand(FIOperandNum).getIndex();
 
   Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
@@ -2268,6 +2268,208 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     MI->eraseFromParent();
     return true;
   }
+  case AMDGPU::V_ADD_U32_e32:
+  case AMDGPU::V_ADD_U32_e64:
+  case AMDGPU::V_ADD_CO_U32_e32:
+  case AMDGPU::V_ADD_CO_U32_e64: {
+    // TODO: Handle sub, and, or.
+    unsigned NumDefs = MI->getNumExplicitDefs();
+    unsigned Src0Idx = NumDefs;
+
+    bool HasClamp = false;
+    MachineOperand *VCCOp = nullptr;
+
+    switch (MI->getOpcode()) {
+    case AMDGPU::V_ADD_U32_e32:
+      break;
+    case AMDGPU::V_ADD_U32_e64:
+      HasClamp = MI->getOperand(3).getImm();
+      break;
+    case AMDGPU::V_ADD_CO_U32_e32:
+      VCCOp = &MI->getOperand(3);
+      break;
+    case AMDGPU::V_ADD_CO_U32_e64:
+      VCCOp = &MI->getOperand(1);
+      HasClamp = MI->getOperand(4).getImm();
+      break;
+    default:
+      break;
+    }
+    bool DeadVCC = !VCCOp || VCCOp->isDead();
+    MachineOperand &DstOp = MI->getOperand(0);
+    Register DstReg = DstOp.getReg();
+
+    unsigned OtherOpIdx =
+        FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx;
+    MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx);
+
+    unsigned Src1Idx = Src0Idx + 1;
+    Register MaterializedReg = FrameReg;
+    Register ScavengedVGPR;
+
+    if (FrameReg && !ST.enableFlatScratch()) {
+      // We should just do an in-place update of the result register. However,
+      // the value there may also be used by the add, in which case we need a
+      // temporary register.
+      //
+      // FIXME: The scavenger is not finding the result register in the
+      // common case where the add does not read the register.
+
+      ScavengedVGPR = RS->scavengeRegisterBackwards(
+          AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, /*SPAdj=*/0);
+
+      // TODO: If we have a free SGPR, it's sometimes better to use a scalar
+      // shift.
+      BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64))
+          .addDef(ScavengedVGPR, RegState::Renamable)
+          .addImm(ST.getWavefrontSizeLog2())
+          .addReg(FrameReg);
+      MaterializedReg = ScavengedVGPR;
+    }
+
+    int64_t Offset = FrameInfo.getObjectOffset(Index);
+    // For the non-immediate case, we could fall through to the default
+    // handling, but we do an in-place update of the result register here to
+    // avoid scavenging another register.
+    if (OtherOp->isImm()) {
+      OtherOp->setImm(OtherOp->getImm() + Offset);
+      Offset = 0;
+    }
+
+    if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) {
+      if (ST.enableFlatScratch() &&
+          !TII->isOperandLegal(*MI, Src1Idx, OtherOp)) {
+        // We didn't need the shift above, so we have an SGPR for the frame
+        // register, but may have a VGPR only operand.
+        //
+        // TODO: On gfx10+, we can easily change the opcode to the e64 version
+        // and use the higher constant bus restriction to avoid this copy.
+
+        if (!ScavengedVGPR) {
+          ScavengedVGPR = RS->scavengeRegisterBackwards(
+              AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
+              /*SPAdj=*/0);
+        }
+
+        assert(ScavengedVGPR != DstReg);
+
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
+            .addReg(MaterializedReg,
+                    MaterializedReg != FrameReg ? RegState::Kill : 0);
+        MaterializedReg = ScavengedVGPR;
+      }
+
+      auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(MI->getOpcode()))
+                        .addDef(DstReg, RegState::Renamable);
+      if (NumDefs == 2)
+        AddI32.add(MI->getOperand(1));
+
+      unsigned MaterializedRegFlags =
+          MaterializedReg != FrameReg ? RegState::Kill : 0;
+
+      if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) {
+        // If we know we have a VGPR already, it's more likely the other
+        // operand is a legal vsrc0.
+        AddI32
+            .add(*OtherOp)
+            .addReg(MaterializedReg, MaterializedRegFlags);
+      } else {
+        // Commute operands to avoid violating VOP2 restrictions. This will
+        // typically happen when using scratch.
+        AddI32
+            .addReg(MaterializedReg, MaterializedRegFlags)
+            .add(*OtherOp);
+      }
+
+      if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
+          MI->getOpcode() == AMDGPU::V_ADD_U32_e64)
+        AddI32.addImm(0); // clamp
+
+      if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e32)
+        AddI32.setOperandDead(3); // Dead vcc
+
+      MaterializedReg = DstReg;
+
+      OtherOp->ChangeToRegister(MaterializedReg, false);
+      OtherOp->setIsKill(true);
+      FIOp->ChangeToImmediate(Offset);
+      Offset = 0;
+    } else if (Offset != 0) {
+      assert(!MaterializedReg);
+      FIOp->ChangeToImmediate(Offset);
+      Offset = 0;
+    } else {
+      if (DeadVCC && !HasClamp) {
+        assert(Offset == 0);
+
+        // TODO: Losing kills and implicit operands. Just mutate to copy and
+        // let lowerCopy deal with it?
+        if (OtherOp->isReg() && OtherOp->getReg() == DstReg) {
+          // Folded to an identity copy.
+          MI->eraseFromParent();
+          return true;
+        }
+
+        // The immediate value should be in OtherOp
+        MI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
+        MI->removeOperand(FIOperandNum);
+
+        unsigned NumOps = MI->getNumOperands();
+        for (unsigned I = NumOps - 2; I >= 2; --I)
+          MI->removeOperand(I);
+
+        if (NumDefs == 2)
+          MI->removeOperand(1);
+
+        // The code below can't deal with a mov.
+        return true;
+      }
+
+      // This folded to a constant, but we have to keep the add around for
+      // pointless implicit defs or clamp modifier.
+      FIOp->ChangeToImmediate(0);
+    }
+
+    // Try to improve legality by commuting.
+    if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) {
+      std::swap(FIOp, OtherOp);
+      std::swap(FIOperandNum, OtherOpIdx);
+    }
+
+    for (unsigned SrcIdx : {Src1Idx, Src0Idx}) {
+      // Depending on operand constraints we may need to insert another copy.
+      if (!TII->isOperandLegal(*MI, SrcIdx)) {
+        // If commuting didn't make the operands legal, we need to materialize
+        // in a register.
+        // TODO: Can use SGPR on gfx10+ in some cases.
+        if (!ScavengedVGPR) {
+          ScavengedVGPR = RS->scavengeRegisterBackwards(
+              AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
+              /*SPAdj=*/0);
+        }
+
+        assert(ScavengedVGPR != DstReg);
+
+        MachineOperand &Src = MI->getOperand(SrcIdx);
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
+            .add(Src);
+
+        Src.ChangeToRegister(ScavengedVGPR, false);
+        Src.setIsKill(true);
+      }
+    }
+
+    // Fold out add of 0 case that can appear in kernels.
+    if (FIOp->isImm() && FIOp->getImm() == 0 && DeadVCC && !HasClamp) {
+      if (OtherOp->isReg() && OtherOp->getReg() != DstReg) {
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::COPY), DstReg).add(*OtherOp);
+      }
+
+      MI->eraseFromParent();
+    }
+
+    return true;
+  }
   case AMDGPU::S_ADD_I32:
   case AMDGPU::S_OR_B32:
   case AMDGPU::S_AND_B32: {
@@ -2336,7 +2538,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     } else {
       if (MaterializedReg)
         OtherOp.ChangeToRegister(MaterializedReg, false);
-      FIOp.ChangeToImmediate(NewOffset);
+      FIOp->ChangeToImmediate(NewOffset);
     }
 
     return true;
@@ -2354,7 +2556,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
         // The offset is always swizzled, just replace it
         if (FrameReg)
-          FIOp.ChangeToRegister(FrameReg, false);
+          FIOp->ChangeToRegister(FrameReg, false);
 
         MachineOperand *OffsetOp =
             TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
@@ -2407,18 +2609,18 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       }
 
       if (!FrameReg) {
-        FIOp.ChangeToImmediate(Offset);
-        if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp))
+        FIOp->ChangeToImmediate(Offset);
+        if (TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp))
           return false;
       }
 
       // We need to use register here. Check if we can use an SGPR or need
       // a VGPR.
-      FIOp.ChangeToRegister(AMDGPU::M0, false);
-      bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp);
+      FIOp->ChangeToRegister(AMDGPU::M0, false);
+      bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, FIOp);
 
       if (!Offset && FrameReg && UseSGPR) {
-        FIOp.setReg(FrameReg);
+        FIOp->setReg(FrameReg);
         return false;
       }
 
@@ -2427,8 +2629,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
       Register TmpReg =
           RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
-      FIOp.setReg(TmpReg);
-      FIOp.setIsKill();
+      FIOp->setReg(TmpReg);
+      FIOp->setIsKill();
 
       if ((!FrameReg || !Offset) && TmpReg) {
        unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
@@ -2457,8 +2659,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       if (!TmpSReg) {
         // Use frame register and restore it after.
         TmpSReg = FrameReg;
-        FIOp.setReg(FrameReg);
-        FIOp.setIsKill(false);
+        FIOp->setReg(FrameReg);
+        FIOp->setIsKill(false);
       }
 
       if (NeedSaveSCC) {
@@ -2706,7 +2908,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
           MI->eraseFromParent();
           return true;
         }
-        FIOp.ChangeToRegister(ResultReg, false, false, true);
+        FIOp->ChangeToRegister(ResultReg, false, false, true);
         return false;
       }
 
@@ -2737,13 +2939,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       // If the offset is simply too big, don't convert to a scratch wave offset
      // relative index.
 
-      FIOp.ChangeToImmediate(Offset);
-      if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
+      FIOp->ChangeToImmediate(Offset);
+      if (!TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp)) {
         Register TmpReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
                                                         MI, false, 0);
         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
             .addImm(Offset);
-        FIOp.ChangeToRegister(TmpReg, false, false, true);
+        FIOp->ChangeToRegister(TmpReg, false, false, true);
       }
     }
   }
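
For orientation, the new V_ADD_* case reduces to a small decision tree: fold the frame object's offset into an immediate source operand when there is one, re-emit the add against the materialized (for non-flat-scratch, wave-size-shifted) frame register when that register still contributes, and otherwise let the add degenerate into a V_MOV_B32 or a plain copy once the carry-out and clamp bit are known to be unused. The sketch below restates that branch structure with plain data types; the names (FoldResult, VAddSrcs, classifyVAdd) are hypothetical and none of this is LLVM API — each return value simply mirrors one branch of the hunk above.

// Hypothetical, self-contained sketch (not LLVM API) of the branch structure
// in the new V_ADD_* handling; each return value mirrors one branch of the
// diff above.
#include <cstdint>
#include <optional>

enum class FoldResult {
  RewriteAddWithFrameReg, // re-emit the add against the materialized frame reg
  FrameIndexBecomesImm,   // no frame reg contribution: FI operand -> immediate
  FoldToMovOrCopy,        // everything folded; vcc/clamp unused, add goes away
  KeepAddWithZeroImm      // folded to 0, but a vcc def or clamp must survive
};

struct VAddSrcs {
  std::optional<int64_t> OtherImm; // the non-FI source, when it is an immediate
  bool HaveFrameReg;               // a frame/base register contributes
  bool DeadVCC;                    // no carry-out, or it is dead
  bool HasClamp;                   // clamp modifier set on an e64 form
};

FoldResult classifyVAdd(VAddSrcs S, int64_t ObjectOffset) {
  // An immediate source absorbs the object offset up front, mirroring
  // OtherOp->setImm(OtherOp->getImm() + Offset) in the patch.
  if (S.OtherImm)
    *S.OtherImm += ObjectOffset;
  int64_t RemainingOffset = S.OtherImm ? 0 : ObjectOffset;

  // The frame register still has to be added in: keep the add and feed it the
  // materialized frame register.
  if ((!S.OtherImm || *S.OtherImm != 0) && S.HaveFrameReg)
    return FoldResult::RewriteAddWithFrameReg;

  // No frame register: the frame index itself becomes a plain immediate.
  if (RemainingOffset != 0)
    return FoldResult::FrameIndexBecomesImm;

  // Offset fully folded away. If nothing else about the add is observable,
  // it degenerates into a V_MOV_B32 / COPY (or is erased outright).
  return (S.DeadVCC && !S.HasClamp) ? FoldResult::FoldToMovOrCopy
                                    : FoldResult::KeepAddWithZeroImm;
}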
