
Commit a2fea8e

[AMDGPU] V_SET_INACTIVE optimizations
Optimize V_SET_INACTIVE by allowing it to run in WWM, so WWM sections are no longer broken up just to set inactive lanes. A WWM V_SET_INACTIVE can typically be lowered to V_CNDMASK; some cases still require the exec-manipulation and V_MOV sequence used by the previous code. GFX9 sees a slight instruction count increase in edge cases due to its smaller constant bus limit. Additionally, avoid introducing exec manipulation and V_MOVs when a source of V_SET_INACTIVE is also its destination. This is a common pattern, as WWM register pre-allocation often assigns the same register.
1 parent 8d28a41 commit a2fea8e

18 files changed: +2104, -3069 lines changed
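
For intuition about the V_CNDMASK lowering described in the commit message, here is a minimal, self-contained model (not part of the commit; lane count, names, and values are illustrative): with the pre-WWM live-lane mask held in an SGPR, every lane selects the active value when its mask bit is set and the inactive value otherwise, while exec is all ones.

// Standalone C++ sketch of the lane-select semantics that let a WWM
// V_SET_INACTIVE fold into a single V_CNDMASK (illustrative model only).
#include <cstdint>
#include <cstdio>

constexpr int WaveSize = 32; // wave32 for brevity; wave64 would use uint64_t

// Models V_CNDMASK_B32 dst, inactive, active, liveMask executed in WWM,
// i.e. with every lane enabled, so all WaveSize lanes of Dst are written.
void setInactiveViaCndmask(uint32_t LiveMask, const uint32_t Active[WaveSize],
                           const uint32_t Inactive[WaveSize],
                           uint32_t Dst[WaveSize]) {
  for (int Lane = 0; Lane < WaveSize; ++Lane)
    Dst[Lane] = ((LiveMask >> Lane) & 1) ? Active[Lane] : Inactive[Lane];
}

int main() {
  uint32_t Active[WaveSize], Inactive[WaveSize], Dst[WaveSize];
  for (int I = 0; I < WaveSize; ++I) {
    Active[I] = 100 + I; // per-lane value for originally active lanes
    Inactive[I] = 0;     // value forced into the inactive (helper) lanes
  }
  setInactiveViaCndmask(0x0000FFFFu, Active, Inactive, Dst); // lanes 0-15 live
  printf("lane 0 = %u, lane 16 = %u\n", (unsigned)Dst[0], (unsigned)Dst[16]);
  return 0;                                                  // prints 100, 0
}

When the V_CNDMASK form would exceed the constant bus or literal limits (the fallback case in the diff below), the commit instead uses the exec-flip plus V_MOV sequence.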

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 140 additions & 29 deletions
@@ -2270,37 +2270,148 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     MI.eraseFromParent();
     break;
   }
-  case AMDGPU::V_SET_INACTIVE_B32: {
-    unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
-    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    // FIXME: We may possibly optimize the COPY once we find ways to make LLVM
-    // optimizations (mainly Register Coalescer) aware of WWM register liveness.
-    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
-        .add(MI.getOperand(1));
-    auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
-    FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
-    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
-        .add(MI.getOperand(2));
-    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
-        .addReg(Exec);
-    MI.eraseFromParent();
-    break;
-  }
+  case AMDGPU::V_SET_INACTIVE_B32:
   case AMDGPU::V_SET_INACTIVE_B64: {
     unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
-    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
-                                 MI.getOperand(0).getReg())
-                             .add(MI.getOperand(1));
-    expandPostRAPseudo(*Copy);
-    auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
-    FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
-    Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
-                   MI.getOperand(0).getReg())
-               .add(MI.getOperand(2));
-    expandPostRAPseudo(*Copy);
-    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
-        .addReg(Exec);
+    unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    unsigned VMovOpc = MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64
+                           ? AMDGPU::V_MOV_B64_PSEUDO
+                           : AMDGPU::V_MOV_B32_e32;
+    Register ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+
+    Register DstReg = MI.getOperand(0).getReg();
+    MachineOperand &ActiveSrc = MI.getOperand(1);
+    MachineOperand &InactiveSrc = MI.getOperand(2);
+
+    bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32;
+
+    // Find implicit exec src if this is running in WWM.
+    Register ExecSrcReg = 0;
+    for (auto &Op : MI.implicit_operands()) {
+      if (Op.isDef() || !Op.isReg())
+        continue;
+      Register OpReg = Op.getReg();
+      if (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO ||
+          OpReg == AMDGPU::SCC)
+        continue;
+      ExecSrcReg = OpReg;
+      break;
+    }
+
+    // Ideally in WWM this operation is lowered to V_CNDMASK; however,
+    // constant bus constraints and the presence of literal constants
+    // present an issue.
+    // Fallback to V_MOV base lowering in all but the common cases.
+    bool InWWM = !!ExecSrcReg;
+    bool UseVCndMask = false;
+    if (InWWM) {
+      const MachineFunction *MF = MI.getParent()->getParent();
+      const MachineRegisterInfo &MRI = MF->getRegInfo();
+      const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64;
+      const MCInstrDesc &Desc = get(Opcode);
+      int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+      int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+      int ConstantBusLimit = ST.getConstantBusLimit(AMDGPU::V_CNDMASK_B32_e64);
+      int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
+      int ConstantBusUses = 1; // Starts at one for ExecRegSrc
+      int LiteralConstants = 0;
+      ConstantBusUses +=
+          usesConstantBus(MRI, ActiveSrc, Desc.operands()[Src1Idx]) ? 1 : 0;
+      ConstantBusUses +=
+          usesConstantBus(MRI, InactiveSrc, Desc.operands()[Src0Idx]) ? 1 : 0;
+      LiteralConstants +=
+          ActiveSrc.isImm() &&
+                  !isInlineConstant(ActiveSrc, Desc.operands()[Src1Idx])
+              ? 1
+              : 0;
+      LiteralConstants +=
+          InactiveSrc.isImm() &&
+                  !isInlineConstant(InactiveSrc, Desc.operands()[Src0Idx])
+              ? 1
+              : 0;
+      UseVCndMask = ConstantBusUses <= ConstantBusLimit &&
+                    LiteralConstants <= LiteralLimit &&
+                    (!VMov64 || (ActiveSrc.isReg() && InactiveSrc.isReg()));
+    }
+
+    if (UseVCndMask && VMov64) {
+      // WWM B64; decompose to two B32 operations.
+      // Test above ensures that both sources are registers.
+      // Note: this is done to avoid falling back to V_MOV multiple times
+      // and introducing exec manipulation for each VGPR separately.
+      assert(ActiveSrc.isReg() && InactiveSrc.isReg());
+      Register ActiveLo = RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub0);
+      Register ActiveHi = RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub1);
+      Register InactiveLo = RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub0);
+      Register InactiveHi = RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub1);
+      MachineInstr *Tmp;
+      Tmp = BuildMI(MBB, MI, DL, get(AMDGPU::V_SET_INACTIVE_B32),
+                    RI.getSubReg(DstReg, AMDGPU::sub0))
+                .addReg(InactiveLo)
+                .addReg(ActiveLo)
+                .addReg(ExecSrcReg, RegState::Implicit)
+                .addReg(DstReg, RegState::ImplicitDefine);
+      expandPostRAPseudo(*Tmp);
+      Tmp = BuildMI(MBB, MI, DL, get(AMDGPU::V_SET_INACTIVE_B32),
+                    RI.getSubReg(DstReg, AMDGPU::sub1))
+                .addReg(InactiveHi, InactiveSrc.isKill() ? RegState::Kill : 0)
+                .addReg(ActiveHi, ActiveSrc.isKill() ? RegState::Kill : 0)
+                .addReg(ExecSrcReg, RegState::Implicit)
+                .addReg(DstReg, RegState::ImplicitDefine);
+      expandPostRAPseudo(*Tmp);
+    } else if (UseVCndMask) {
+      // WWM B32; use V_CNDMASK.
+      MachineInstr *VCndMask =
+          BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+              .addImm(0)
+              .add(InactiveSrc)
+              .addImm(0)
+              .add(ActiveSrc)
+              .addReg(ExecSrcReg);
+      // Copy implicit defs in case this is part of V_SET_INACTIVE_B64.
+      for (auto &Op : MI.implicit_operands()) {
+        if (!Op.isDef())
+          continue;
+        VCndMask->addOperand(Op);
+      }
+    } else {
+      // Fallback V_MOV case.
+      // Avoid unnecessary work if a src is the destination.
+      // This can happen if WWM register allocation was efficient.
+      bool SkipActive = ActiveSrc.isReg() && ActiveSrc.getReg() == DstReg;
+      bool SkipInactive = InactiveSrc.isReg() && InactiveSrc.getReg() == DstReg;
+      if (!SkipActive) {
+        if (InWWM) {
+          // Cancel WWM
+          BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addReg(ExecSrcReg);
+        }
+        // Copy active lanes
+        MachineInstr *VMov =
+            BuildMI(MBB, MI, DL, get(VMovOpc), MI.getOperand(0).getReg())
+                .add(ActiveSrc);
+        if (VMov64)
+          expandPostRAPseudo(*VMov);
+      }
+      if (!SkipInactive) {
+        // Set exec mask to inactive lanes
+        MachineInstr *ExecMI = BuildMI(MBB, MI, DL, get(NotOpc), ExecReg)
+                                   .addReg(InWWM ? ExecSrcReg : ExecReg);
+        ExecMI->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
+        // Copy inactive lanes
+        MachineInstr *VMov =
+            BuildMI(MBB, MI, DL, get(VMovOpc), DstReg).add(InactiveSrc);
+        if (VMov64)
+          expandPostRAPseudo(*VMov);
+        if (!InWWM) {
+          // Restore original exec mask
+          BuildMI(MBB, MI, DL, get(NotOpc), ExecReg).addReg(ExecReg);
+        }
+      }
+      if (InWWM) {
+        // Restore WWM
+        BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addImm(-1);
+      }
+    }
     MI.eraseFromParent();
     break;
   }

llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp

Lines changed: 56 additions & 1 deletion
@@ -178,6 +178,7 @@ class SIWholeQuadMode : public MachineFunctionPass {
   SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
   SmallVector<MachineInstr *, 4> KillInstrs;
   SmallVector<MachineInstr *, 4> InitExecInstrs;
+  SmallVector<MachineInstr *, 4> SetInactiveInstrs;
 
   void printInfo();
 
@@ -226,6 +227,8 @@ class SIWholeQuadMode : public MachineFunctionPass {
   MachineBasicBlock::iterator lowerInitExecInstrs(MachineBasicBlock &Entry,
                                                   bool &Changed);
 
+  void harmonizeTransitions();
+
 public:
   static char ID;
 
@@ -478,7 +481,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                           std::vector<WorkItem> &Worklist) {
   char GlobalFlags = 0;
   bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
-  SmallVector<MachineInstr *, 4> SetInactiveInstrs;
   SmallVector<MachineInstr *, 4> SoftWQMInstrs;
   bool HasImplicitDerivatives =
       MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
@@ -555,6 +557,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
         GlobalFlags |= StateStrictWQM;
       } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
                  Opcode == AMDGPU::V_SET_INACTIVE_B64) {
+        // Disable strict states here while marking, relax it later.
         III.Disabled = StateStrict;
         MachineOperand &Inactive = MI.getOperand(2);
         if (Inactive.isReg()) {
@@ -565,6 +568,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
           }
         }
         SetInactiveInstrs.push_back(&MI);
+        BBI.NeedsLowering = true;
       } else if (TII->isDisableWQM(MI)) {
         BBI.Needs |= StateExact;
         if (!(BBI.InNeeds & StateExact)) {
@@ -1042,6 +1046,7 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
   LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
 
   SmallVector<MachineInstr *, 4> SplitPoints;
+  Register ActiveLanesReg = 0;
   char State = BI.InitialState;
 
   for (MachineInstr &MI : llvm::make_early_inc_range(
@@ -1058,6 +1063,20 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
       SplitPoint = lowerKillF32(MBB, MI);
       break;
+    case AMDGPU::ENTER_STRICT_WWM:
+      ActiveLanesReg = MI.getOperand(0).getReg();
+      break;
+    case AMDGPU::EXIT_STRICT_WWM:
+      ActiveLanesReg = 0;
+      break;
+    case AMDGPU::V_SET_INACTIVE_B32:
+    case AMDGPU::V_SET_INACTIVE_B64:
+      if (ActiveLanesReg) {
+        MI.addOperand(*MBB.getParent(),
+                      MachineOperand::CreateReg(ActiveLanesReg, false, true));
+      } else
+        assert(State == StateExact || State == StateWQM);
+      break;
     default:
       break;
     }
@@ -1628,6 +1647,40 @@ SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry, bool &Changed) {
   return InsertPt;
 }
 
+void SIWholeQuadMode::harmonizeTransitions() {
+  // Relax requirements on SET_INACTIVE to allow it in WWM regions.
+  for (MachineInstr *MI : SetInactiveInstrs) {
+    if (MI->getOpcode() == AMDGPU::COPY)
+      continue;
+
+    Instructions[MI].Disabled &= ~StateStrictWWM;
+
+    auto MBB = MI->getParent();
+    auto It = MI->getIterator();
+    if (It == MBB->end())
+      continue;
+
+    bool AddWWM = false;
+    auto NextMI = std::next(It);
+    if (NextMI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
+        NextMI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
+      // Groups of SET_INACTIVE are more efficient in WWM.
+      AddWWM = true;
+    } else {
+      // Back propagate WWM needs of next instruction.
+      auto III = Instructions.find(&*NextMI);
+      AddWWM =
+          (III != Instructions.end() && III->second.Needs & StateStrictWWM);
+    }
+
+    if (!AddWWM)
+      continue;
+
+    LLVM_DEBUG(dbgs() << "merge into WWM: " << *MI);
+    Instructions[MI].Needs |= StateStrictWWM;
+  }
+}
+
 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
   LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
                     << " ------------- \n");
@@ -1640,6 +1693,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
   LowerToMovInstrs.clear();
   KillInstrs.clear();
   InitExecInstrs.clear();
+  SetInactiveInstrs.clear();
   StateTransition.clear();
 
   ST = &MF.getSubtarget<GCNSubtarget>();
@@ -1713,6 +1767,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
       Changed = true;
     } else {
       // Wave mode switching requires full lowering pass.
+      harmonizeTransitions();
       for (auto BII : Blocks)
         processBlock(*BII.first, BII.first == &Entry);
       // Lowering blocks causes block splitting so perform as a second pass.
