Skip to content

Commit 16cda01

Browse files
authored
[AMDGPU] V_SET_INACTIVE optimizations (#98864)
Optimize V_SET_INACTIVE by allowing it to run in WWM, so that WWM sections are not broken up for inactive-lane setting. A WWM V_SET_INACTIVE can typically be lowered to V_CNDMASK; some cases still require exec manipulation with V_MOVs, as in the previous code. GFX9 sees a slight instruction-count increase in edge cases due to its smaller constant bus. Additionally, avoid introducing exec manipulation and V_MOVs when a source of V_SET_INACTIVE is also the destination — a common pattern, since WWM register pre-allocation often assigns the same register.
1 parent d18ca27 commit 16cda01

25 files changed

+3205
-4278
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 157 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -2098,8 +2098,22 @@ unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
20982098
}
20992099
}
21002100

2101+
Register SIInstrInfo::findSetInactiveMask(const MachineInstr &MI) {
2102+
assert(MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
2103+
MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64);
2104+
for (auto &Op : MI.implicit_operands()) {
2105+
if (Op.isDef())
2106+
continue;
2107+
Register OpReg = Op.getReg();
2108+
if (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO ||
2109+
OpReg == AMDGPU::SCC)
2110+
continue;
2111+
return OpReg;
2112+
}
2113+
return Register();
2114+
}
2115+
21012116
bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2102-
const SIRegisterInfo *TRI = ST.getRegisterInfo();
21032117
MachineBasicBlock &MBB = *MI.getParent();
21042118
DebugLoc DL = MBB.findDebugLoc(MI);
21052119
switch (MI.getOpcode()) {
@@ -2273,37 +2287,147 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
22732287
MI.eraseFromParent();
22742288
break;
22752289
}
2276-
case AMDGPU::V_SET_INACTIVE_B32: {
2277-
unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
2278-
unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2279-
// FIXME: We may possibly optimize the COPY once we find ways to make LLVM
2280-
// optimizations (mainly Register Coalescer) aware of WWM register liveness.
2281-
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
2282-
.add(MI.getOperand(1));
2283-
auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
2284-
FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
2285-
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
2286-
.add(MI.getOperand(2));
2287-
BuildMI(MBB, MI, DL, get(NotOpc), Exec)
2288-
.addReg(Exec);
2289-
MI.eraseFromParent();
2290-
break;
2291-
}
2290+
case AMDGPU::V_SET_INACTIVE_B32:
22922291
case AMDGPU::V_SET_INACTIVE_B64: {
22932292
unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
2294-
unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2295-
MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
2296-
MI.getOperand(0).getReg())
2297-
.add(MI.getOperand(1));
2298-
expandPostRAPseudo(*Copy);
2299-
auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
2300-
FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
2301-
Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
2302-
MI.getOperand(0).getReg())
2303-
.add(MI.getOperand(2));
2304-
expandPostRAPseudo(*Copy);
2305-
BuildMI(MBB, MI, DL, get(NotOpc), Exec)
2306-
.addReg(Exec);
2293+
unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2294+
unsigned VMovOpc = MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64
2295+
? AMDGPU::V_MOV_B64_PSEUDO
2296+
: AMDGPU::V_MOV_B32_e32;
2297+
Register ExecReg = RI.getExec();
2298+
Register DstReg = MI.getOperand(0).getReg();
2299+
MachineOperand &ActiveSrc = MI.getOperand(1);
2300+
MachineOperand &InactiveSrc = MI.getOperand(2);
2301+
2302+
// Find implicit register defining lanes active outside WWM.
2303+
Register ExecSrcReg = findSetInactiveMask(MI);
2304+
assert(ExecSrcReg && "V_SET_INACTIVE must be in known WWM region");
2305+
// Note: default here is set to ExecReg so that functional MIR is still
2306+
// generated if implicit def is not found and assertions are disabled.
2307+
if (!ExecSrcReg)
2308+
ExecSrcReg = ExecReg;
2309+
2310+
// Ideally in WWM this operation is lowered to V_CNDMASK; however,
2311+
// constant bus constraints and the presence of literal constants
2312+
// present an issue.
2313+
// Fallback to V_MOV base lowering in all but the common cases.
2314+
const bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32;
2315+
MachineFunction *MF = MBB.getParent();
2316+
MachineRegisterInfo &MRI = MF->getRegInfo();
2317+
const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64;
2318+
const MCInstrDesc &Desc = get(Opcode);
2319+
2320+
const APInt ActiveImm(64, ActiveSrc.isImm() ? ActiveSrc.getImm() : 0);
2321+
const APInt InactiveImm(64, InactiveSrc.isImm() ? InactiveSrc.getImm() : 0);
2322+
const APInt ActiveImmLo(32, ActiveImm.getLoBits(32).getZExtValue());
2323+
const APInt ActiveImmHi(32, ActiveImm.getHiBits(32).getZExtValue());
2324+
const APInt InactiveImmLo(32, InactiveImm.getLoBits(32).getZExtValue());
2325+
const APInt InactiveImmHi(32, InactiveImm.getHiBits(32).getZExtValue());
2326+
2327+
int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
2328+
int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
2329+
2330+
int ConstantBusLimit = ST.getConstantBusLimit(AMDGPU::V_CNDMASK_B32_e64);
2331+
int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
2332+
int ConstantBusUses =
2333+
1 + // Starts at 1 for ExecSrcReg
2334+
(usesConstantBus(MRI, ActiveSrc, Desc.operands()[Src1Idx]) ? 1 : 0) +
2335+
(usesConstantBus(MRI, InactiveSrc, Desc.operands()[Src0Idx]) ? 1 : 0);
2336+
int LiteralConstants =
2337+
((ActiveSrc.isReg() ||
2338+
(ActiveSrc.isImm() && isInlineConstant(ActiveImm)))
2339+
? 0
2340+
: 1) +
2341+
((InactiveSrc.isReg() ||
2342+
(InactiveSrc.isImm() && isInlineConstant(InactiveImm)))
2343+
? 0
2344+
: 1);
2345+
2346+
bool UseVCndMask =
2347+
ConstantBusUses <= ConstantBusLimit && LiteralConstants <= LiteralLimit;
2348+
if (VMov64 && UseVCndMask) {
2349+
// Decomposition must not introduce new literals.
2350+
UseVCndMask &=
2351+
ActiveSrc.isReg() ||
2352+
(isInlineConstant(ActiveImmLo) && isInlineConstant(ActiveImmHi)) ||
2353+
(!isInlineConstant(ActiveImm));
2354+
UseVCndMask &= InactiveSrc.isReg() ||
2355+
(isInlineConstant(InactiveImmLo) &&
2356+
isInlineConstant(InactiveImmHi)) ||
2357+
(!isInlineConstant(InactiveImm));
2358+
}
2359+
2360+
if (UseVCndMask && VMov64) {
2361+
// Dual V_CNDMASK_B32
2362+
MachineOperand ActiveLo = buildExtractSubRegOrImm(
2363+
MI, MRI, ActiveSrc, nullptr, AMDGPU::sub0, nullptr);
2364+
MachineOperand ActiveHi = buildExtractSubRegOrImm(
2365+
MI, MRI, ActiveSrc, nullptr, AMDGPU::sub1, nullptr);
2366+
MachineOperand InactiveLo = buildExtractSubRegOrImm(
2367+
MI, MRI, InactiveSrc, nullptr, AMDGPU::sub0, nullptr);
2368+
MachineOperand InactiveHi = buildExtractSubRegOrImm(
2369+
MI, MRI, InactiveSrc, nullptr, AMDGPU::sub1, nullptr);
2370+
if (ActiveSrc.isReg())
2371+
ActiveHi.setIsKill(ActiveSrc.isKill());
2372+
if (InactiveSrc.isReg())
2373+
InactiveHi.setIsKill(InactiveSrc.isKill());
2374+
BuildMI(MBB, MI, DL, Desc, RI.getSubReg(DstReg, AMDGPU::sub0))
2375+
.addImm(0)
2376+
.add(InactiveLo)
2377+
.addImm(0)
2378+
.add(ActiveLo)
2379+
.addReg(ExecSrcReg)
2380+
.addReg(DstReg, RegState::ImplicitDefine);
2381+
BuildMI(MBB, MI, DL, Desc, RI.getSubReg(DstReg, AMDGPU::sub1))
2382+
.addImm(0)
2383+
.add(InactiveHi)
2384+
.addImm(0)
2385+
.add(ActiveHi)
2386+
.addReg(ExecSrcReg)
2387+
.addReg(DstReg, RegState::ImplicitDefine);
2388+
} else if (UseVCndMask) {
2389+
// Single V_CNDMASK_B32
2390+
BuildMI(MBB, MI, DL, Desc, DstReg)
2391+
.addImm(0)
2392+
.add(InactiveSrc)
2393+
.addImm(0)
2394+
.add(ActiveSrc)
2395+
.addReg(ExecSrcReg);
2396+
} else {
2397+
// Fallback V_MOV case.
2398+
// Avoid unnecessary work if a source VGPR is also the destination.
2399+
// This can happen if WWM register allocation was efficient.
2400+
// Note: this assumes WWM execution.
2401+
bool DstIsActive = ActiveSrc.isReg() && ActiveSrc.getReg() == DstReg;
2402+
bool DstIsInactive =
2403+
InactiveSrc.isReg() && InactiveSrc.getReg() == DstReg;
2404+
if (!DstIsInactive) {
2405+
// Set exec mask to inactive lanes,
2406+
// but only if active lanes would be overwritten.
2407+
if (DstIsActive) {
2408+
BuildMI(MBB, MI, DL, get(NotOpc), ExecReg)
2409+
.addReg(ExecSrcReg)
2410+
.setOperandDead(3); // Dead scc
2411+
}
2412+
// Copy inactive lanes
2413+
MachineInstr *VMov =
2414+
BuildMI(MBB, MI, DL, get(VMovOpc), DstReg).add(InactiveSrc);
2415+
if (VMov64)
2416+
expandPostRAPseudo(*VMov);
2417+
}
2418+
if (!DstIsActive) {
2419+
// Set exec mask to active lanes
2420+
BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addReg(ExecSrcReg);
2421+
// Copy active lanes
2422+
MachineInstr *VMov =
2423+
BuildMI(MBB, MI, DL, get(VMovOpc), MI.getOperand(0).getReg())
2424+
.add(ActiveSrc);
2425+
if (VMov64)
2426+
expandPostRAPseudo(*VMov);
2427+
}
2428+
// Restore WWM
2429+
BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addImm(-1);
2430+
}
23072431
MI.eraseFromParent();
23082432
break;
23092433
}
@@ -5647,6 +5771,9 @@ unsigned SIInstrInfo::buildExtractSubReg(
56475771
MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,
56485772
const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
56495773
unsigned SubIdx, const TargetRegisterClass *SubRC) const {
5774+
if (!SuperReg.getReg().isVirtual())
5775+
return RI.getSubReg(SuperReg.getReg(), SubIdx);
5776+
56505777
MachineBasicBlock *MBB = MI->getParent();
56515778
DebugLoc DL = MI->getDebugLoc();
56525779
Register SubReg = MRI.createVirtualRegister(SubRC);

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1437,6 +1437,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
14371437
// This is used if an operand is a 32 bit register but needs to be aligned
14381438
// regardless.
14391439
void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const;
1440+
1441+
static Register findSetInactiveMask(const MachineInstr &MI);
14401442
};
14411443

14421444
/// \brief Returns true if a reg:subreg pair P has a TRC class

0 commit comments

Comments
 (0)