
Commit 5736b08

[AMDGPU] V_SET_INACTIVE optimizations
Optimize V_SET_INACTIVE by allowing it to run in WWM, so WWM sections are no longer broken up just to set inactive lanes. In WWM, V_SET_INACTIVE can typically be lowered to V_CNDMASK; some cases still require the exec-manipulation and V_MOV sequence used by the previous code. GFX9 sees a slight instruction count increase in edge cases due to its smaller constant bus. Additionally, avoid introducing exec manipulation and V_MOVs where a source of V_SET_INACTIVE is also the destination. This is a common pattern, as WWM register pre-allocation often assigns the same register.
1 parent 5740044 commit 5736b08

17 files changed: +954 / -1320 lines

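Before the diffs, a brief orientation: the decision structure behind the SIInstrInfo.cpp change below can be summarized in a small sketch. This is a simplified, hypothetical illustration and not code from the patch; the enum, function name, and parameters (chooseLowering, BusUses, LiteralLimit, and so on) are invented stand-ins for checks the real expansion performs on MachineOperands.

// Hypothetical summary of the three lowering strategies chosen by the new
// expansion (illustrative names; the real checks operate on MachineOperands).
enum class WWMLoweringKind {
  CndMask,     // in WWM and operands fit V_CNDMASK encoding limits
  CndMask64,   // 64-bit value, split into two 32-bit V_CNDMASK halves
  ExecFlipMov, // fallback: flip EXEC around V_MOVs, as the old expansion did
};

// InWWM: the pseudo carries an implicit copy of the live-lane mask.
// BusUses/BusLimit and Literals/LiteralLimit model the constant-bus and
// literal-operand restrictions of V_CNDMASK_B32 on the target.
static WWMLoweringKind chooseLowering(bool InWWM, bool Is64Bit,
                                      bool SrcsAreRegs, int BusUses,
                                      int BusLimit, int Literals,
                                      int LiteralLimit) {
  bool FitsCndMask = InWWM && BusUses <= BusLimit && Literals <= LiteralLimit &&
                     (!Is64Bit || SrcsAreRegs);
  if (!FitsCndMask)
    return WWMLoweringKind::ExecFlipMov;
  return Is64Bit ? WWMLoweringKind::CndMask64 : WWMLoweringKind::CndMask;
}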
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 140 additions & 29 deletions
@@ -2273,37 +2273,148 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     MI.eraseFromParent();
     break;
   }
-  case AMDGPU::V_SET_INACTIVE_B32: {
-    unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
-    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    // FIXME: We may possibly optimize the COPY once we find ways to make LLVM
-    // optimizations (mainly Register Coalescer) aware of WWM register liveness.
-    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
-        .add(MI.getOperand(1));
-    auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
-    FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
-    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
-        .add(MI.getOperand(2));
-    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
-        .addReg(Exec);
-    MI.eraseFromParent();
-    break;
-  }
+  case AMDGPU::V_SET_INACTIVE_B32:
   case AMDGPU::V_SET_INACTIVE_B64: {
     unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
-    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
-                                 MI.getOperand(0).getReg())
-                             .add(MI.getOperand(1));
-    expandPostRAPseudo(*Copy);
-    auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
-    FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
-    Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
-                   MI.getOperand(0).getReg())
-               .add(MI.getOperand(2));
-    expandPostRAPseudo(*Copy);
-    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
-        .addReg(Exec);
+    unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    unsigned VMovOpc = MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64
+                           ? AMDGPU::V_MOV_B64_PSEUDO
+                           : AMDGPU::V_MOV_B32_e32;
+    Register ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+
+    Register DstReg = MI.getOperand(0).getReg();
+    MachineOperand &ActiveSrc = MI.getOperand(1);
+    MachineOperand &InactiveSrc = MI.getOperand(2);
+
+    bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32;
+
+    // Find implicit exec src if this is running in WWM.
+    Register ExecSrcReg = 0;
+    for (auto &Op : MI.implicit_operands()) {
+      if (Op.isDef() || !Op.isReg())
+        continue;
+      Register OpReg = Op.getReg();
+      if (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO ||
+          OpReg == AMDGPU::SCC)
+        continue;
+      ExecSrcReg = OpReg;
+      break;
+    }
+
+    // Ideally in WWM this operation is lowered to V_CNDMASK; however,
+    // constant bus constraints and the presence of literal constants
+    // present an issue.
+    // Fallback to V_MOV base lowering in all but the common cases.
+    bool InWWM = !!ExecSrcReg;
+    bool UseVCndMask = false;
+    if (InWWM) {
+      const MachineFunction *MF = MI.getParent()->getParent();
+      const MachineRegisterInfo &MRI = MF->getRegInfo();
+      const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64;
+      const MCInstrDesc &Desc = get(Opcode);
+      int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+      int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+      int ConstantBusLimit = ST.getConstantBusLimit(AMDGPU::V_CNDMASK_B32_e64);
+      int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
+      int ConstantBusUses = 1; // Starts at one for ExecRegSrc
+      int LiteralConstants = 0;
+      ConstantBusUses +=
+          usesConstantBus(MRI, ActiveSrc, Desc.operands()[Src1Idx]) ? 1 : 0;
+      ConstantBusUses +=
+          usesConstantBus(MRI, InactiveSrc, Desc.operands()[Src0Idx]) ? 1 : 0;
+      LiteralConstants +=
+          ActiveSrc.isImm() &&
+                  !isInlineConstant(ActiveSrc, Desc.operands()[Src1Idx])
+              ? 1
+              : 0;
+      LiteralConstants +=
+          InactiveSrc.isImm() &&
+                  !isInlineConstant(InactiveSrc, Desc.operands()[Src0Idx])
+              ? 1
+              : 0;
+      UseVCndMask = ConstantBusUses <= ConstantBusLimit &&
+                    LiteralConstants <= LiteralLimit &&
+                    (!VMov64 || (ActiveSrc.isReg() && InactiveSrc.isReg()));
+    }
+
+    if (UseVCndMask && VMov64) {
+      // WWM B64; decompose to two B32 operations.
+      // Test above ensures that both sources are registers.
+      // Note: this is done to avoid falling back to V_MOV multiple times
+      // and introducing exec manipulation for each VGPR separately.
+      assert(ActiveSrc.isReg() && InactiveSrc.isReg());
+      Register ActiveLo = RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub0);
+      Register ActiveHi = RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub1);
+      Register InactiveLo = RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub0);
+      Register InactiveHi = RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub1);
+      MachineInstr *Tmp;
+      Tmp = BuildMI(MBB, MI, DL, get(AMDGPU::V_SET_INACTIVE_B32),
+                    RI.getSubReg(DstReg, AMDGPU::sub0))
+                .addReg(InactiveLo)
+                .addReg(ActiveLo)
+                .addReg(ExecSrcReg, RegState::Implicit)
+                .addReg(DstReg, RegState::ImplicitDefine);
+      expandPostRAPseudo(*Tmp);
+      Tmp = BuildMI(MBB, MI, DL, get(AMDGPU::V_SET_INACTIVE_B32),
+                    RI.getSubReg(DstReg, AMDGPU::sub1))
+                .addReg(InactiveHi, InactiveSrc.isKill() ? RegState::Kill : 0)
+                .addReg(ActiveHi, ActiveSrc.isKill() ? RegState::Kill : 0)
+                .addReg(ExecSrcReg, RegState::Implicit)
+                .addReg(DstReg, RegState::ImplicitDefine);
+      expandPostRAPseudo(*Tmp);
+    } else if (UseVCndMask) {
+      // WWM B32; use V_CNDMASK.
+      MachineInstr *VCndMask =
+          BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+              .addImm(0)
+              .add(InactiveSrc)
+              .addImm(0)
+              .add(ActiveSrc)
+              .addReg(ExecSrcReg);
+      // Copy implicit defs in case this is part of V_SET_INACTIVE_B64.
+      for (auto &Op : MI.implicit_operands()) {
+        if (!Op.isDef())
+          continue;
+        VCndMask->addOperand(Op);
+      }
+    } else {
+      // Fallback V_MOV case.
+      // Avoid unnecessary work if a src is the destination.
+      // This can happen if WWM register allocation was efficient.
+      bool SkipActive = ActiveSrc.isReg() && ActiveSrc.getReg() == DstReg;
+      bool SkipInactive = InactiveSrc.isReg() && InactiveSrc.getReg() == DstReg;
+      if (!SkipActive) {
+        if (InWWM) {
+          // Cancel WWM
+          BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addReg(ExecSrcReg);
+        }
+        // Copy active lanes
+        MachineInstr *VMov =
+            BuildMI(MBB, MI, DL, get(VMovOpc), MI.getOperand(0).getReg())
+                .add(ActiveSrc);
+        if (VMov64)
+          expandPostRAPseudo(*VMov);
+      }
+      if (!SkipInactive) {
+        // Set exec mask to inactive lanes
+        MachineInstr *ExecMI = BuildMI(MBB, MI, DL, get(NotOpc), ExecReg)
+                                   .addReg(InWWM ? ExecSrcReg : ExecReg);
+        ExecMI->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
+        // Copy inactive lanes
+        MachineInstr *VMov =
+            BuildMI(MBB, MI, DL, get(VMovOpc), DstReg).add(InactiveSrc);
+        if (VMov64)
+          expandPostRAPseudo(*VMov);
+        if (!InWWM) {
+          // Restore original exec mask
+          BuildMI(MBB, MI, DL, get(NotOpc), ExecReg).addReg(ExecReg);
+        }
+      }
+      if (InWWM) {
+        // Restore WWM
+        BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addImm(-1);
+      }
+    }
     MI.eraseFromParent();
     break;
   }

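For readers following the fallback branch above, here is a hedged sketch of the order of operations it emits and where the src == dst check removes work. The pseudo-instruction strings and the SetInactiveSrc type are illustrative only; the real code builds MachineInstrs with BuildMI exactly as shown in the diff.

// Hedged sketch (not the patch's BuildMI-based emission) of the fallback
// V_MOV path: it shows the order of emitted operations and where the
// src == dst check avoids copies and EXEC manipulation entirely.
#include <string>
#include <vector>

struct SetInactiveSrc {
  bool IsReg = false;
  unsigned Reg = 0;
};

// Returns the pseudo-instruction sequence the fallback path would emit.
static std::vector<std::string>
fallbackSequence(unsigned DstReg, SetInactiveSrc Active,
                 SetInactiveSrc Inactive, bool InWWM) {
  std::vector<std::string> Seq;
  bool SkipActive = Active.IsReg && Active.Reg == DstReg;
  bool SkipInactive = Inactive.IsReg && Inactive.Reg == DstReg;
  if (!SkipActive) {
    if (InWWM)
      Seq.push_back("s_mov exec, <saved live mask>"); // cancel WWM first
    Seq.push_back("v_mov dst, active");               // write active lanes
  }
  if (!SkipInactive) {
    // Select the inactive lanes: invert the saved mask in WWM, or EXEC itself.
    Seq.push_back(InWWM ? "s_not exec, <saved live mask>"
                        : "s_not exec, exec");
    Seq.push_back("v_mov dst, inactive");             // write inactive lanes
    if (!InWWM)
      Seq.push_back("s_not exec, exec");              // restore original EXEC
  }
  if (InWWM)
    Seq.push_back("s_mov exec, -1");                  // restore whole-wave EXEC
  return Seq;
}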
llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp

Lines changed: 56 additions & 1 deletion
@@ -178,6 +178,7 @@ class SIWholeQuadMode : public MachineFunctionPass {
   SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
   SmallVector<MachineInstr *, 4> KillInstrs;
   SmallVector<MachineInstr *, 4> InitExecInstrs;
+  SmallVector<MachineInstr *, 4> SetInactiveInstrs;

   void printInfo();

@@ -225,6 +226,8 @@ class SIWholeQuadMode : public MachineFunctionPass {
   void lowerInitExec(MachineInstr &MI);
   MachineBasicBlock::iterator lowerInitExecInstrs(MachineBasicBlock &Entry);

+  void harmonizeTransitions();
+
 public:
   static char ID;

@@ -477,7 +480,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                         std::vector<WorkItem> &Worklist) {
   char GlobalFlags = 0;
   bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
-  SmallVector<MachineInstr *, 4> SetInactiveInstrs;
   SmallVector<MachineInstr *, 4> SoftWQMInstrs;
   bool HasImplicitDerivatives =
       MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
@@ -554,6 +556,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
       GlobalFlags |= StateStrictWQM;
     } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
                Opcode == AMDGPU::V_SET_INACTIVE_B64) {
+      // Disable strict states here while marking, relax it later.
       III.Disabled = StateStrict;
       MachineOperand &Inactive = MI.getOperand(2);
       if (Inactive.isReg()) {
@@ -565,6 +568,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
       }
       SetInactiveInstrs.push_back(&MI);
       GlobalFlags |= StateStrictWWM;
+      BBI.NeedsLowering = true;
     } else if (TII->isDisableWQM(MI)) {
       BBI.Needs |= StateExact;
       if (!(BBI.InNeeds & StateExact)) {
@@ -1038,6 +1042,7 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
   LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");

   SmallVector<MachineInstr *, 4> SplitPoints;
+  Register ActiveLanesReg = 0;
   char State = BI.InitialState;

   for (MachineInstr &MI : llvm::make_early_inc_range(
@@ -1054,6 +1059,20 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
       SplitPoint = lowerKillF32(MBB, MI);
       break;
+    case AMDGPU::ENTER_STRICT_WWM:
+      ActiveLanesReg = MI.getOperand(0).getReg();
+      break;
+    case AMDGPU::EXIT_STRICT_WWM:
+      ActiveLanesReg = 0;
+      break;
+    case AMDGPU::V_SET_INACTIVE_B32:
+    case AMDGPU::V_SET_INACTIVE_B64:
+      if (ActiveLanesReg) {
+        MI.addOperand(*MBB.getParent(),
+                      MachineOperand::CreateReg(ActiveLanesReg, false, true));
+      } else
+        assert(State == StateExact || State == StateWQM);
+      break;
     default:
       break;
     }
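The lowerBlock() hunk above tags each V_SET_INACTIVE with the register that ENTER_STRICT_WWM saved the live-lane mask into. A minimal sketch of that bookkeeping, using an invented Inst struct and string opcodes in place of MachineInstr, might look like this:

// Minimal sketch of the tagging added to lowerBlock(): while scanning a
// block, remember which register ENTER_STRICT_WWM saved the live-lane mask
// into and attach it to every V_SET_INACTIVE inside that WWM region; the
// post-RA expansion later reads it back as the implicit "exec source".
#include <optional>
#include <string>
#include <vector>

struct Inst {
  std::string Opcode;
  unsigned Def = 0;                       // e.g. the ENTER_STRICT_WWM save reg
  std::optional<unsigned> ActiveLanesUse; // implicit use added here
};

static void tagSetInactive(std::vector<Inst> &Block) {
  std::optional<unsigned> ActiveLanesReg;
  for (Inst &I : Block) {
    if (I.Opcode == "ENTER_STRICT_WWM")
      ActiveLanesReg = I.Def;            // live mask saved into this register
    else if (I.Opcode == "EXIT_STRICT_WWM")
      ActiveLanesReg.reset();            // leaving the WWM region
    else if (I.Opcode == "V_SET_INACTIVE" && ActiveLanesReg)
      I.ActiveLanesUse = ActiveLanesReg; // tag it for expandPostRAPseudo
  }
}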
@@ -1618,6 +1637,40 @@ SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry) {
   return InsertPt;
 }

+void SIWholeQuadMode::harmonizeTransitions() {
+  // Relax requirements on SET_INACTIVE to allow it in WWM regions.
+  for (MachineInstr *MI : SetInactiveInstrs) {
+    if (MI->getOpcode() == AMDGPU::COPY)
+      continue;
+
+    Instructions[MI].Disabled &= ~StateStrictWWM;
+
+    auto MBB = MI->getParent();
+    auto It = MI->getIterator();
+    if (It == MBB->end())
+      continue;
+
+    bool AddWWM = false;
+    auto NextMI = std::next(It);
+    if (NextMI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
+        NextMI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
+      // Groups of SET_INACTIVE are more efficient in WWM.
+      AddWWM = true;
+    } else {
+      // Back propagate WWM needs of next instruction.
+      auto III = Instructions.find(&*NextMI);
+      AddWWM =
+          (III != Instructions.end() && III->second.Needs & StateStrictWWM);
+    }
+
+    if (!AddWWM)
+      continue;
+
+    LLVM_DEBUG(dbgs() << "merge into WWM: " << *MI);
+    Instructions[MI].Needs |= StateStrictWWM;
+  }
+}
+
 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
   LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
                     << " ------------- \n");
@@ -1630,6 +1683,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
   LowerToMovInstrs.clear();
   KillInstrs.clear();
   InitExecInstrs.clear();
+  SetInactiveInstrs.clear();
   StateTransition.clear();

   ST = &MF.getSubtarget<GCNSubtarget>();
@@ -1702,6 +1756,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
     LIS->InsertMachineInstrInMaps(*MI);
     lowerKillInstrs(true);
   } else {
+    harmonizeTransitions();
     for (auto BII : Blocks)
       processBlock(*BII.first, BII.first == &Entry);
     // Lowering blocks causes block splitting so perform as a second pass.

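Finally, a hedged sketch of the per-instruction choice harmonizeTransitions() makes; the InstrStates struct and the StateStrictWWM value here are illustrative stand-ins for the pass's Instructions map and state flags, not its real types.

// Hedged sketch: SET_INACTIVE is now permitted inside strict-WWM regions,
// and it is pulled into WWM when its successor is another SET_INACTIVE or
// already needs strict WWM, so no exit/re-enter of WWM is inserted between.
#include <cstdint>

constexpr std::uint8_t StateStrictWWM = 1 << 2; // illustrative flag value

struct InstrStates {
  std::uint8_t Needs = 0;
  std::uint8_t Disabled = 0;
};

static void harmonizeOne(InstrStates &SetInactive, bool NextIsSetInactive,
                         const InstrStates *Next) {
  // Clear the strict-WWM "disabled" bit that scanInstructions set earlier.
  SetInactive.Disabled &= ~StateStrictWWM;
  // Merge into WWM when the next instruction keeps the region alive anyway.
  bool MergeIntoWWM =
      NextIsSetInactive || (Next && (Next->Needs & StateStrictWWM));
  if (MergeIntoWWM)
    SetInactive.Needs |= StateStrictWWM;
}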