
Commit 2a86991

arsenm authored and easyonaadit committed
AMDGPU: Fix broken frame index expansion for v_add_co_u32_e64 (llvm#114634)
With an explicit carry-out operand, one too many operands were deleted, resulting in a malformed v_mov_b32.
1 parent dfef124 commit 2a86991

17 files changed: +1367 -1338 lines changed
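The faulty path is pure operand bookkeeping: when the add folds to a plain mov, a V_ADD_CO_U32_e64 must shed exactly three operands (the folded frame-index source, the clamp modifier, and the explicit carry-out def) while keeping the destination, the surviving source, and the trailing implicit $exec use; deleting one more produces the malformed v_mov_b32 the commit message describes. Below is a standalone sketch of that arithmetic mirroring the fixed trimming loop in the patch; the operand layout (including the trailing implicit operand) is inferred from the indices the patch uses, and none of these names are LLVM API:

#include <cassert>
#include <string>
#include <vector>

int main() {
  // Hypothetical post-RA operand list for V_ADD_CO_U32_e64; "exec" stands
  // in for the trailing implicit use (an assumption about the layout).
  std::vector<std::string> Ops = {"vdst", "carry", "src0", "src1",
                                  "clamp", "exec"};
  unsigned NumDefs = 2;      // vdst plus the explicit carry-out
  unsigned FIOperandNum = 2; // the frame-index source being folded away

  Ops.erase(Ops.begin() + FIOperandNum); // drop the folded source
  unsigned NumOps = Ops.size();
  for (unsigned I = NumOps - 2; I >= NumDefs + 1; --I)
    Ops.erase(Ops.begin() + I); // removes clamp, keeps the implicit exec
  if (NumDefs == 2)
    Ops.erase(Ops.begin() + 1); // drop the dead carry-out def

  // Exactly the shape of a v_mov_b32: dst, src, implicit exec.
  assert((Ops == std::vector<std::string>{"vdst", "src1", "exec"}));
  return 0;
}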

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 300 additions & 0 deletions
@@ -2445,6 +2445,306 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     MI->eraseFromParent();
     return true;
   }
+  case AMDGPU::V_ADD_U32_e32:
+  case AMDGPU::V_ADD_U32_e64:
+  case AMDGPU::V_ADD_CO_U32_e32:
+  case AMDGPU::V_ADD_CO_U32_e64: {
+    // TODO: Handle sub, and, or.
+    unsigned NumDefs = MI->getNumExplicitDefs();
+    unsigned Src0Idx = NumDefs;
+
+    bool HasClamp = false;
+    MachineOperand *VCCOp = nullptr;
+
+    switch (MI->getOpcode()) {
+    case AMDGPU::V_ADD_U32_e32:
+      break;
+    case AMDGPU::V_ADD_U32_e64:
+      HasClamp = MI->getOperand(3).getImm();
+      break;
+    case AMDGPU::V_ADD_CO_U32_e32:
+      VCCOp = &MI->getOperand(3);
+      break;
+    case AMDGPU::V_ADD_CO_U32_e64:
+      VCCOp = &MI->getOperand(1);
+      HasClamp = MI->getOperand(4).getImm();
+      break;
+    default:
+      break;
+    }
+    bool DeadVCC = !VCCOp || VCCOp->isDead();
+    MachineOperand &DstOp = MI->getOperand(0);
+    Register DstReg = DstOp.getReg();
+
+    unsigned OtherOpIdx =
+        FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx;
+    MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx);
+
+    unsigned Src1Idx = Src0Idx + 1;
+    Register MaterializedReg = FrameReg;
+    Register ScavengedVGPR;
+
+    if (FrameReg && !ST.enableFlatScratch()) {
+      // We should just do an in-place update of the result register. However,
+      // the value there may also be used by the add, in which case we need a
+      // temporary register.
+      //
+      // FIXME: The scavenger is not finding the result register in the
+      // common case where the add does not read the register.
+
+      ScavengedVGPR = RS->scavengeRegisterBackwards(
+          AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, /*SPAdj=*/0);
+
+      // TODO: If we have a free SGPR, it's sometimes better to use a scalar
+      // shift.
+      BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64))
+          .addDef(ScavengedVGPR, RegState::Renamable)
+          .addImm(ST.getWavefrontSizeLog2())
+          .addReg(FrameReg);
+      MaterializedReg = ScavengedVGPR;
+    }
+
+    int64_t Offset = FrameInfo.getObjectOffset(Index);
+    // For the non-immediate case, we could fall through to the default
+    // handling, but we do an in-place update of the result register here to
+    // avoid scavenging another register.
+    if (OtherOp->isImm()) {
+      OtherOp->setImm(OtherOp->getImm() + Offset);
+      Offset = 0;
+    }
+
+    if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) {
+      if (ST.enableFlatScratch() &&
+          !TII->isOperandLegal(*MI, Src1Idx, OtherOp)) {
+        // We didn't need the shift above, so we have an SGPR for the frame
+        // register, but may have a VGPR only operand.
+        //
+        // TODO: On gfx10+, we can easily change the opcode to the e64 version
+        // and use the higher constant bus restriction to avoid this copy.
+
+        if (!ScavengedVGPR) {
+          ScavengedVGPR = RS->scavengeRegisterBackwards(
+              AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
+              /*SPAdj=*/0);
+        }
+
+        assert(ScavengedVGPR != DstReg);
+
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
+            .addReg(MaterializedReg,
+                    MaterializedReg != FrameReg ? RegState::Kill : 0);
+        MaterializedReg = ScavengedVGPR;
+      }
+
+      // TODO: In the flat scratch case, if this is an add of an SGPR, and SCC
+      // is not live, we could use a scalar add + vector add instead of 2
+      // vector adds.
+      auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(MI->getOpcode()))
+                        .addDef(DstReg, RegState::Renamable);
+      if (NumDefs == 2)
+        AddI32.add(MI->getOperand(1));
+
+      unsigned MaterializedRegFlags =
+          MaterializedReg != FrameReg ? RegState::Kill : 0;
+
+      if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) {
+        // If we know we have a VGPR already, it's more likely the other
+        // operand is a legal vsrc0.
+        AddI32
+            .add(*OtherOp)
+            .addReg(MaterializedReg, MaterializedRegFlags);
+      } else {
+        // Commute operands to avoid violating VOP2 restrictions. This will
+        // typically happen when using scratch.
+        AddI32
+            .addReg(MaterializedReg, MaterializedRegFlags)
+            .add(*OtherOp);
+      }
+
+      if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
+          MI->getOpcode() == AMDGPU::V_ADD_U32_e64)
+        AddI32.addImm(0); // clamp
+
+      if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e32)
+        AddI32.setOperandDead(3); // Dead vcc
+
+      MaterializedReg = DstReg;
+
+      OtherOp->ChangeToRegister(MaterializedReg, false);
+      OtherOp->setIsKill(true);
+      FIOp->ChangeToImmediate(Offset);
+      Offset = 0;
+    } else if (Offset != 0) {
+      assert(!MaterializedReg);
+      FIOp->ChangeToImmediate(Offset);
+      Offset = 0;
+    } else {
+      if (DeadVCC && !HasClamp) {
+        assert(Offset == 0);
+
+        // TODO: Losing kills and implicit operands. Just mutate to copy and
+        // let lowerCopy deal with it?
+        if (OtherOp->isReg() && OtherOp->getReg() == DstReg) {
+          // Folded to an identity copy.
+          MI->eraseFromParent();
+          return true;
+        }
+
+        // The immediate value should be in OtherOp
+        MI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
+        MI->removeOperand(FIOperandNum);
+
+        unsigned NumOps = MI->getNumOperands();
+        for (unsigned I = NumOps - 2; I >= NumDefs + 1; --I)
+          MI->removeOperand(I);
+
+        if (NumDefs == 2)
+          MI->removeOperand(1);
+
+        // The code below can't deal with a mov.
+        return true;
+      }
+
+      // This folded to a constant, but we have to keep the add around for
+      // pointless implicit defs or clamp modifier.
+      FIOp->ChangeToImmediate(0);
+    }
+
+    // Try to improve legality by commuting.
+    if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) {
+      std::swap(FIOp, OtherOp);
+      std::swap(FIOperandNum, OtherOpIdx);
+    }
+
+    for (unsigned SrcIdx : {Src1Idx, Src0Idx}) {
+      // Depending on operand constraints we may need to insert another copy.
+      if (!TII->isOperandLegal(*MI, SrcIdx)) {
+        // If commuting didn't make the operands legal, we need to materialize
+        // in a register.
+        // TODO: Can use SGPR on gfx10+ in some cases.
+        if (!ScavengedVGPR) {
+          ScavengedVGPR = RS->scavengeRegisterBackwards(
+              AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
+              /*SPAdj=*/0);
+        }
+
+        assert(ScavengedVGPR != DstReg);
+
+        MachineOperand &Src = MI->getOperand(SrcIdx);
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
+            .add(Src);
+
+        Src.ChangeToRegister(ScavengedVGPR, false);
+        Src.setIsKill(true);
+      }
+    }
+
+    // Fold out add of 0 case that can appear in kernels.
+    if (FIOp->isImm() && FIOp->getImm() == 0 && DeadVCC && !HasClamp) {
+      if (OtherOp->isReg() && OtherOp->getReg() != DstReg) {
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::COPY), DstReg).add(*OtherOp);
+      }
+
+      MI->eraseFromParent();
+    }
+
+    return true;
+  }
+  case AMDGPU::S_ADD_I32: {
+    // TODO: Handle s_or_b32, s_and_b32.
+    unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1;
+    MachineOperand &OtherOp = MI->getOperand(OtherOpIdx);
+
+    assert(FrameReg || MFI->isBottomOfStack());
+
+    MachineOperand &DstOp = MI->getOperand(0);
+    const DebugLoc &DL = MI->getDebugLoc();
+    Register MaterializedReg = FrameReg;
+
+    // Defend against live scc, which should never happen in practice.
+    bool DeadSCC = MI->getOperand(3).isDead();
+
+    Register TmpReg;
+
+    // FIXME: Scavenger should figure out that the result register is
+    // available. Also should do this for the v_add case.
+    if (OtherOp.isReg() && OtherOp.getReg() != DstOp.getReg())
+      TmpReg = DstOp.getReg();
+
+    if (FrameReg && !ST.enableFlatScratch()) {
+      // FIXME: In the common case where the add does not also read its result
+      // (i.e. this isn't a reg += fi), it's not finding the dest reg as
+      // available.
+      if (!TmpReg)
+        TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
+                                               MI, false, 0);
+      BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_LSHR_B32))
+          .addDef(TmpReg, RegState::Renamable)
+          .addReg(FrameReg)
+          .addImm(ST.getWavefrontSizeLog2())
+          .setOperandDead(3); // Set SCC dead
+      MaterializedReg = TmpReg;
+    }
+
+    int64_t Offset = FrameInfo.getObjectOffset(Index);
+
+    // For the non-immediate case, we could fall through to the default
+    // handling, but we do an in-place update of the result register here to
+    // avoid scavenging another register.
+    if (OtherOp.isImm()) {
+      OtherOp.setImm(OtherOp.getImm() + Offset);
+      Offset = 0;
+
+      if (MaterializedReg)
+        FIOp->ChangeToRegister(MaterializedReg, false);
+      else
+        FIOp->ChangeToImmediate(0);
+    } else if (MaterializedReg) {
+      // If we can't fold the other operand, do another increment.
+      Register DstReg = DstOp.getReg();
+
+      if (!TmpReg && MaterializedReg == FrameReg) {
+        TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
+                                               MI, /*RestoreAfter=*/false, 0,
+                                               /*AllowSpill=*/false);
+        DstReg = TmpReg;
+      }
+
+      auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_ADD_I32))
+                        .addDef(DstReg, RegState::Renamable)
+                        .addReg(MaterializedReg, RegState::Kill)
+                        .add(OtherOp);
+      if (DeadSCC)
+        AddI32.setOperandDead(3);
+
+      MaterializedReg = DstReg;
+
+      OtherOp.ChangeToRegister(MaterializedReg, false);
+      OtherOp.setIsKill(true);
+      OtherOp.setIsRenamable(true);
+      FIOp->ChangeToImmediate(Offset);
+    } else {
+      // If we don't have any other offset to apply, we can just directly
+      // interpret the frame index as the offset.
+      FIOp->ChangeToImmediate(Offset);
+    }
+
+    if (DeadSCC && OtherOp.isImm() && OtherOp.getImm() == 0) {
+      assert(Offset == 0);
+      MI->removeOperand(3);
+      MI->removeOperand(OtherOpIdx);
+      MI->setDesc(TII->get(FIOp->isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
+    } else if (DeadSCC && FIOp->isImm() && FIOp->getImm() == 0) {
+      assert(Offset == 0);
+      MI->removeOperand(3);
+      MI->removeOperand(FIOperandNum);
+      MI->setDesc(
+          TII->get(OtherOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
+    }
+
+    assert(!FIOp->isFI());
+    return true;
+  }
   default: {
     // Other access to frame index
     const DebugLoc &DL = MI->getDebugLoc();
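Both new cases materialize the frame register the same way when flat scratch is disabled: shift it right by the wavefront-size log2 (v_lshrrev_b32 in the VALU path, s_lshr_b32 in the SALU path), then fold in the frame object's offset. A minimal sketch of that address arithmetic, assuming the SGPR frame register holds a wave-scaled byte offset; the helper name is illustrative, not LLVM API:

#include <cstdint>

// Convert the wave-scaled frame register to a per-lane byte offset, then
// add the frame object's offset (what the shift + add expansion computes).
constexpr uint32_t perLaneAddress(uint32_t FrameReg,
                                  unsigned WavefrontSizeLog2,
                                  uint32_t ObjectOffset) {
  return (FrameReg >> WavefrontSizeLog2) + ObjectOffset;
}

// Worked example for wave64 (log2 = 6): 0x1000 >> 6 = 0x40, plus 16 = 0x50.
static_assert(perLaneAddress(0x1000, 6, 16) == 0x50, "wave64 example");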
