Commit b290ced

[AMDGPU] V_SET_INACTIVE optimizations
Optimize V_SET_INACTIVE by always running it in WWM. This keeps WWM sections unbroken and allows V_SET_INACTIVE to be lowered to V_CNDMASK in most cases. Some cases still require exec manipulation and V_MOVs, as in the previous code. GFX9 sees a slight instruction count increase in edge cases due to its smaller constant bus.

Additionally:
- Avoid introducing exec manipulation and V_MOVs where a source of V_SET_INACTIVE is also the destination.
- Lower any V_SET_INACTIVE not touched by marking to COPY.
1 parent 438ad9f commit b290ced

24 files changed: +3243, -4368 lines
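
For orientation, here is a minimal standalone sketch (not part of the patch) of the per-lane semantics involved: V_SET_INACTIVE writes the active-lane source to lanes that were active before entering WWM and the inactive-lane source to the rest. The old expansion, modeled below, toggles exec around two V_MOVs; once the instruction itself runs in WWM with the pre-WWM exec mask saved in an SGPR, the same result is a single per-lane select, which is what V_CNDMASK_B32 provides. Names here are illustrative only, and wave64 is assumed for simplicity.

#include <cstdint>

constexpr int WaveSize = 64; // assuming wave64 for illustration
using Wave = uint32_t[WaveSize];

// Old-style expansion, modeled: write the active value under exec, flip
// exec, write the inactive value, then flip exec back.
void setInactiveWithExecFlips(Wave Dst, const Wave Active,
                              const Wave Inactive, uint64_t Exec) {
  for (int L = 0; L < WaveSize; ++L)
    if ((Exec >> L) & 1)
      Dst[L] = Active[L];   // V_MOV under exec
  for (int L = 0; L < WaveSize; ++L)
    if ((~Exec >> L) & 1)
      Dst[L] = Inactive[L]; // V_MOV under ~exec
}

// New-style expansion, modeled: the instruction already executes in WWM
// (all lanes enabled) and the pre-WWM exec mask is carried in an SGPR, so a
// single per-lane select (V_CNDMASK_B32) produces the same result without
// touching exec.
void setInactiveWithCndMask(Wave Dst, const Wave Active, const Wave Inactive,
                            uint64_t SavedExec) {
  for (int L = 0; L < WaveSize; ++L)
    Dst[L] = ((SavedExec >> L) & 1) ? Active[L] : Inactive[L];
}

Both versions produce the same Dst for any SavedExec; the difference is purely in how many instructions and exec-mask writes the expansion costs.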

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 154 additions & 29 deletions
@@ -2273,37 +2273,162 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     MI.eraseFromParent();
     break;
   }
-  case AMDGPU::V_SET_INACTIVE_B32: {
-    unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
-    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    // FIXME: We may possibly optimize the COPY once we find ways to make LLVM
-    // optimizations (mainly Register Coalescer) aware of WWM register liveness.
-    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
-        .add(MI.getOperand(1));
-    auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
-    FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
-    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
-        .add(MI.getOperand(2));
-    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
-        .addReg(Exec);
-    MI.eraseFromParent();
-    break;
-  }
+  case AMDGPU::V_SET_INACTIVE_B32:
   case AMDGPU::V_SET_INACTIVE_B64: {
     unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
-    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
-                                 MI.getOperand(0).getReg())
-                             .add(MI.getOperand(1));
-    expandPostRAPseudo(*Copy);
-    auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
-    FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
-    Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
-                   MI.getOperand(0).getReg())
-               .add(MI.getOperand(2));
-    expandPostRAPseudo(*Copy);
-    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
-        .addReg(Exec);
+    unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    unsigned VMovOpc = MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64
+                           ? AMDGPU::V_MOV_B64_PSEUDO
+                           : AMDGPU::V_MOV_B32_e32;
+    Register ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+    Register DstReg = MI.getOperand(0).getReg();
+    MachineOperand &ActiveSrc = MI.getOperand(1);
+    MachineOperand &InactiveSrc = MI.getOperand(2);
+
+    // Find implicit register defining lanes active outside WWM.
+    // Note: default here is set to ExecReg so that functional MIR is still
+    // generated if implicit def is not found and assertions are disabled.
+    Register ExecSrcReg = ExecReg;
+    for (auto &Op : MI.implicit_operands()) {
+      if (Op.isDef() || !Op.isReg())
+        continue;
+      Register OpReg = Op.getReg();
+      if (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO ||
+          OpReg == AMDGPU::SCC)
+        continue;
+      ExecSrcReg = OpReg;
+      break;
+    }
+    assert(ExecSrcReg != ExecReg &&
+           "V_SET_INACTIVE must be in known WWM region");
+
+    // Ideally in WWM this operation is lowered to V_CNDMASK; however,
+    // constant bus constraints and the presence of literal constants
+    // present an issue.
+    // Fallback to V_MOV base lowering in all but the common cases.
+    const bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32;
+    const MachineFunction *MF = MI.getParent()->getParent();
+    const MachineRegisterInfo &MRI = MF->getRegInfo();
+    const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64;
+    const MCInstrDesc &Desc = get(Opcode);
+
+    const APInt ActiveImm(64, ActiveSrc.isImm() ? ActiveSrc.getImm() : 0);
+    const APInt InactiveImm(64, InactiveSrc.isImm() ? InactiveSrc.getImm() : 0);
+    const APInt ActiveImmLo(32, ActiveImm.getLoBits(32).getZExtValue());
+    const APInt ActiveImmHi(32, ActiveImm.getHiBits(32).getZExtValue());
+    const APInt InactiveImmLo(32, InactiveImm.getLoBits(32).getZExtValue());
+    const APInt InactiveImmHi(32, InactiveImm.getHiBits(32).getZExtValue());
+
+    int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+    int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+
+    int ConstantBusLimit = ST.getConstantBusLimit(AMDGPU::V_CNDMASK_B32_e64);
+    int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
+    int ConstantBusUses =
+        1 + // Starts at 1 for ExecSrcReg
+        (usesConstantBus(MRI, ActiveSrc, Desc.operands()[Src1Idx]) ? 1 : 0) +
+        (usesConstantBus(MRI, InactiveSrc, Desc.operands()[Src0Idx]) ? 1 : 0);
+    int LiteralConstants =
+        (ActiveSrc.isImm() && !isInlineConstant(ActiveImm) ? 1 : 0) +
+        (InactiveSrc.isImm() && !isInlineConstant(InactiveImm) ? 1 : 0);
+
+    bool UseVCndMask =
+        ConstantBusUses <= ConstantBusLimit && LiteralConstants <= LiteralLimit;
+    if (VMov64 && UseVCndMask) {
+      // Decomposition must not introduce new literals.
+      UseVCndMask &=
+          ActiveSrc.isReg() ||
+          (isInlineConstant(ActiveImmLo) && isInlineConstant(ActiveImmHi)) ||
+          (!isInlineConstant(ActiveImm));
+      UseVCndMask &= InactiveSrc.isReg() ||
+                     (isInlineConstant(InactiveImmLo) &&
+                      isInlineConstant(InactiveImmHi)) ||
+                     (!isInlineConstant(InactiveImm));
+    }
+
+    if (UseVCndMask && VMov64) {
+      // Dual V_CNDMASK_B32
+      MachineOperand ActiveLo =
+          ActiveSrc.isReg()
+              ? MachineOperand::CreateReg(
+                    RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub0), false,
+                    /*isImp=*/false, /*isKill*/ false)
+              : MachineOperand::CreateImm(ActiveImmLo.getSExtValue());
+      MachineOperand ActiveHi =
+          ActiveSrc.isReg()
+              ? MachineOperand::CreateReg(
+                    RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub1), false,
+                    /*isImp=*/false, /*isKill*/ ActiveSrc.isKill())
+              : MachineOperand::CreateImm(ActiveImmHi.getSExtValue());
+      MachineOperand InactiveLo =
+          InactiveSrc.isReg()
+              ? MachineOperand::CreateReg(
+                    RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub0), false,
+                    /*isImp=*/false, /*isKill*/ false)
+              : MachineOperand::CreateImm(InactiveImmLo.getSExtValue());
+      MachineOperand InactiveHi =
+          InactiveSrc.isReg()
+              ? MachineOperand::CreateReg(
+                    RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub1), false,
+                    /*isImp=*/false, /*isKill*/ InactiveSrc.isKill())
+              : MachineOperand::CreateImm(InactiveImmHi.getSExtValue());
+      BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DstReg, AMDGPU::sub0))
+          .addImm(0)
+          .add(InactiveLo)
+          .addImm(0)
+          .add(ActiveLo)
+          .addReg(ExecSrcReg)
+          .addReg(DstReg, RegState::ImplicitDefine);
+      BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DstReg, AMDGPU::sub1))
+          .addImm(0)
+          .add(InactiveHi)
+          .addImm(0)
+          .add(ActiveHi)
+          .addReg(ExecSrcReg)
+          .addReg(DstReg, RegState::ImplicitDefine);
+    } else if (UseVCndMask) {
+      // Single V_CNDMASK_B32
+      BuildMI(MBB, MI, DL, get(Opcode), DstReg)
+          .addImm(0)
+          .add(InactiveSrc)
+          .addImm(0)
+          .add(ActiveSrc)
+          .addReg(ExecSrcReg);
+    } else {
+      // Fallback V_MOV case.
+      // Avoid unnecessary work if a source VGPR is also the destination.
+      // This can happen if WWM register allocation was efficient.
+      // Note: this assumes WWM execution.
+      bool DstIsActive = ActiveSrc.isReg() && ActiveSrc.getReg() == DstReg;
+      bool DstIsInactive =
+          InactiveSrc.isReg() && InactiveSrc.getReg() == DstReg;
+      if (!DstIsInactive) {
+        // Set exec mask to inactive lanes,
+        // but only if active lanes would be overwritten.
+        if (DstIsActive) {
+          MachineInstr *ExecMI =
+              BuildMI(MBB, MI, DL, get(NotOpc), ExecReg).addReg(ExecSrcReg);
+          ExecMI->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
+        }
+        // Copy inactive lanes
+        MachineInstr *VMov =
+            BuildMI(MBB, MI, DL, get(VMovOpc), DstReg).add(InactiveSrc);
+        if (VMov64)
+          expandPostRAPseudo(*VMov);
+      }
+      if (!DstIsActive) {
+        // Set exec mask to active lanes
+        BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addReg(ExecSrcReg);
+        // Copy active lanes
+        MachineInstr *VMov =
+            BuildMI(MBB, MI, DL, get(VMovOpc), MI.getOperand(0).getReg())
+                .add(ActiveSrc);
+        if (VMov64)
+          expandPostRAPseudo(*VMov);
+      }
+      // Restore WWM
+      BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addImm(-1);
+    }
     MI.eraseFromParent();
     break;
   }
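
The choice above between the V_CNDMASK path and the V_MOV fallback hinges on constant-bus and literal budgets. Here is a rough standalone model of that check (names and structure are illustrative, and it omits the extra rules for splitting 64-bit immediates): the saved pre-WWM exec mask is an SGPR and always consumes one constant-bus read, so additional SGPR or literal sources can exceed the per-instruction limit.

// Toy model of the "can this V_SET_INACTIVE become V_CNDMASK?" decision.
struct SrcInfo {
  bool IsSGPR = false;
  bool IsLiteral = false; // immediate that is not an inline constant
};

bool canUseCndMask(SrcInfo Active, SrcInfo Inactive, int ConstantBusLimit,
                   int LiteralLimit) {
  int BusUses = 1 + // saved exec mask SGPR always takes one read
                (Active.IsSGPR || Active.IsLiteral ? 1 : 0) +
                (Inactive.IsSGPR || Inactive.IsLiteral ? 1 : 0);
  int Literals = (Active.IsLiteral ? 1 : 0) + (Inactive.IsLiteral ? 1 : 0);
  return BusUses <= ConstantBusLimit && Literals <= LiteralLimit;
}

This is also why the commit message calls out GFX9 edge cases: with a smaller constant-bus budget and no VOP3 literals, the saved exec mask alone can use up the allowance, so SGPR or literal sources push the expansion onto the V_MOV fallback path.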

llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp

Lines changed: 59 additions & 9 deletions
@@ -128,6 +128,7 @@ struct InstrInfo {
   char Needs = 0;
   char Disabled = 0;
   char OutNeeds = 0;
+  char MarkedStates = 0;
 };
 
 struct BlockInfo {
@@ -175,9 +176,10 @@ class SIWholeQuadMode : public MachineFunctionPass {
 
   SmallVector<MachineInstr *, 2> LiveMaskQueries;
   SmallVector<MachineInstr *, 4> LowerToMovInstrs;
-  SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
+  SmallSetVector<MachineInstr *, 4> LowerToCopyInstrs;
   SmallVector<MachineInstr *, 4> KillInstrs;
   SmallVector<MachineInstr *, 4> InitExecInstrs;
+  SmallVector<MachineInstr *, 4> SetInactiveInstrs;
 
   void printInfo();
 
@@ -295,6 +297,9 @@ void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
 
   assert(!(Flag & StateExact) && Flag != 0);
 
+  // Capture all states requested in marking including disabled ones.
+  II.MarkedStates |= Flag;
+
   // Remove any disabled states from the flag. The user that required it gets
   // an undefined value in the helper lanes. For example, this can happen if
   // the result of an atomic is used by instruction that requires WQM, where
@@ -478,7 +483,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                        std::vector<WorkItem> &Worklist) {
   char GlobalFlags = 0;
   bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
-  SmallVector<MachineInstr *, 4> SetInactiveInstrs;
   SmallVector<MachineInstr *, 4> SoftWQMInstrs;
   bool HasImplicitDerivatives =
       MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
@@ -512,9 +516,9 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
         // The WQM intrinsic requires its output to have all the helper lanes
         // correct, so we need it to be in WQM.
         Flags = StateWQM;
-        LowerToCopyInstrs.push_back(&MI);
+        LowerToCopyInstrs.insert(&MI);
       } else if (Opcode == AMDGPU::SOFT_WQM) {
-        LowerToCopyInstrs.push_back(&MI);
+        LowerToCopyInstrs.insert(&MI);
         SoftWQMInstrs.push_back(&MI);
       } else if (Opcode == AMDGPU::STRICT_WWM) {
         // The STRICT_WWM intrinsic doesn't make the same guarantee, and plus
@@ -555,16 +559,18 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
         GlobalFlags |= StateStrictWQM;
       } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
                  Opcode == AMDGPU::V_SET_INACTIVE_B64) {
+        // Disable strict states; StrictWQM will be added as required later.
         III.Disabled = StateStrict;
         MachineOperand &Inactive = MI.getOperand(2);
         if (Inactive.isReg()) {
           if (Inactive.isUndef()) {
-            LowerToCopyInstrs.push_back(&MI);
+            LowerToCopyInstrs.insert(&MI);
           } else {
             markOperand(MI, Inactive, StateStrictWWM, Worklist);
           }
         }
         SetInactiveInstrs.push_back(&MI);
+        BBI.NeedsLowering = true;
       } else if (TII->isDisableWQM(MI)) {
         BBI.Needs |= StateExact;
         if (!(BBI.InNeeds & StateExact)) {
@@ -1042,6 +1048,7 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
   LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
 
   SmallVector<MachineInstr *, 4> SplitPoints;
+  Register ActiveLanesReg = 0;
   char State = BI.InitialState;
 
   for (MachineInstr &MI : llvm::make_early_inc_range(
@@ -1058,6 +1065,20 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      SplitPoint = lowerKillF32(MBB, MI);
      break;
+    case AMDGPU::ENTER_STRICT_WWM:
+      ActiveLanesReg = MI.getOperand(0).getReg();
+      break;
+    case AMDGPU::EXIT_STRICT_WWM:
+      ActiveLanesReg = 0;
+      break;
+    case AMDGPU::V_SET_INACTIVE_B32:
+    case AMDGPU::V_SET_INACTIVE_B64:
+      if (ActiveLanesReg) {
+        MI.addOperand(*MBB.getParent(),
+                      MachineOperand::CreateReg(ActiveLanesReg, false, true));
+      } else
+        assert(State == StateExact || State == StateWQM);
+      break;
     default:
       break;
     }
@@ -1497,13 +1518,14 @@ bool SIWholeQuadMode::lowerCopyInstrs() {
     }
   }
   for (MachineInstr *MI : LowerToCopyInstrs) {
+    LLVM_DEBUG(dbgs() << "simplify: " << *MI);
+
+    Register RecomputeReg = 0;
     if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
         MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
       assert(MI->getNumExplicitOperands() == 3);
-      // the only reason we should be here is V_SET_INACTIVE has
-      // an undef input so it is being replaced by a simple copy.
-      // There should be a second undef source that we should remove.
-      assert(MI->getOperand(2).isUndef());
+      if (MI->getOperand(2).isReg())
+        RecomputeReg = MI->getOperand(2).getReg();
       MI->removeOperand(2);
       MI->untieRegOperand(1);
     } else {
@@ -1514,7 +1536,19 @@ bool SIWholeQuadMode::lowerCopyInstrs() {
                           ? (unsigned)AMDGPU::COPY
                           : TII->getMovOpcode(TRI->getRegClassForOperandReg(
                                 *MRI, MI->getOperand(0)));
+    int Index = MI->findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
+    while (Index >= 0) {
+      MI->removeOperand(Index);
+      Index = MI->findRegisterUseOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
+    }
+
     MI->setDesc(TII->get(CopyOp));
+    LLVM_DEBUG(dbgs() << " -> " << *MI);
+
+    if (RecomputeReg) {
+      LIS->removeInterval(RecomputeReg);
+      LIS->createAndComputeVirtRegInterval(RecomputeReg);
+    }
   }
   return !LowerToCopyInstrs.empty() || !LowerToMovInstrs.empty();
 }
@@ -1656,6 +1690,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
   LowerToMovInstrs.clear();
   KillInstrs.clear();
   InitExecInstrs.clear();
+  SetInactiveInstrs.clear();
   StateTransition.clear();
 
   ST = &MF.getSubtarget<GCNSubtarget>();
@@ -1712,6 +1747,21 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
     Changed = true;
   }
 
+  // Check if V_SET_INACTIVE was touched by a strict state mode.
+  // If so, promote to WWM; otherwise lower to COPY.
+  for (MachineInstr *MI : SetInactiveInstrs) {
+    if (LowerToCopyInstrs.contains(MI))
+      continue;
+    if (Instructions[MI].MarkedStates & StateStrict) {
+      Instructions[MI].Needs |= StateStrictWWM;
+      Instructions[MI].Disabled &= ~StateStrictWWM;
+      Blocks[MI->getParent()].Needs |= StateStrictWWM;
+    } else {
+      LLVM_DEBUG(dbgs() << "Has no WWM marking: " << *MI);
+      LowerToCopyInstrs.insert(MI);
+    }
+  }
+
   LLVM_DEBUG(printInfo());
 
   Changed |= lowerLiveMaskQueries();

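The final hunk's promote-or-copy decision can be modeled in isolation. The state bit values below are illustrative and simply mirror the style of the pass's char bitmask; they are not taken verbatim from the source.

#include <cstdint>

// Illustrative state bits in the style of SIWholeQuadMode's bitmask.
enum : uint8_t {
  StateWQM = 0x1,
  StateStrictWWM = 0x2,
  StateStrictWQM = 0x4,
  StateExact = 0x8,
  StateStrict = StateStrictWWM | StateStrictWQM,
};

// A V_SET_INACTIVE that any strict-mode marking touched must really run in
// whole-wave mode; one that no strict marking reached never has its inactive
// lanes observed, so it degenerates to a plain copy of its active source.
bool promoteToWWM(uint8_t MarkedStates) { return MarkedStates & StateStrict; }
bool lowerToCopy(uint8_t MarkedStates) { return !promoteToWWM(MarkedStates); }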