Commit e9c1dd1

Update approach:
- Always run V_SET_INACTIVE in WWM if it is touched by WWM marking
- Lower any V_SET_INACTIVE not touched by marking to COPY
- Further optimize the V_SET_INACTIVE expansion based on being in WWM
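For orientation, here is a minimal stand-alone sketch of the check that gates the V_CNDMASK lowering in the diff below. Everything in it (Operand, isInline, canUseVCndMask) is a simplified stand-in for illustration, not the LLVM API; the real code queries usesConstantBus and isInlineConstant against the V_CNDMASK_B32_e64 operand descriptors.

    // Hypothetical sketch: count constant bus slots and literal operands
    // the way the expansion does before committing to V_CNDMASK_B32.
    #include <cstdint>

    struct Operand {
      bool IsReg;    // register operand (else immediate)
      bool IsSGPR;   // SGPR sources consume a constant bus slot
      int64_t Imm;   // immediate value, valid when !IsReg
    };

    // Simplified inline-constant test (the hardware also accepts a few
    // floating-point constants; omitted here).
    static bool isInline(int64_t Imm) { return Imm >= -16 && Imm <= 64; }

    static bool canUseVCndMask(const Operand &Active, const Operand &Inactive,
                               int ConstantBusLimit, int LiteralLimit) {
      int BusUses = 1; // the exec-source SGPR always takes one slot
      BusUses += (Active.IsReg && Active.IsSGPR) ? 1 : 0;
      BusUses += (Inactive.IsReg && Inactive.IsSGPR) ? 1 : 0;
      int Literals = (!Active.IsReg && !isInline(Active.Imm)) ? 1 : 0;
      Literals += (!Inactive.IsReg && !isInline(Inactive.Imm)) ? 1 : 0;
      return BusUses <= ConstantBusLimit && Literals <= LiteralLimit;
    }

In the real check, usesConstantBus also accounts for immediates that consume the constant bus on older subtargets; the sketch only captures the counting.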
1 parent: a2fea8e

18 files changed, +1863 -2023 lines

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 112 additions & 98 deletions
@@ -2278,15 +2278,14 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
                            ? AMDGPU::V_MOV_B64_PSEUDO
                            : AMDGPU::V_MOV_B32_e32;
     Register ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-
     Register DstReg = MI.getOperand(0).getReg();
     MachineOperand &ActiveSrc = MI.getOperand(1);
     MachineOperand &InactiveSrc = MI.getOperand(2);
 
-    bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32;
-
-    // Find implicit exec src if this is running in WWM.
-    Register ExecSrcReg = 0;
+    // Find implicit register defining lanes active outside WWM.
+    // Note: default here is set to ExecReg so that functional MIR is still
+    // generated if implicit def is not found and assertions are disabled.
+    Register ExecSrcReg = ExecReg;
     for (auto &Op : MI.implicit_operands()) {
       if (Op.isDef() || !Op.isReg())
         continue;
@@ -2297,120 +2296,135 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
       ExecSrcReg = OpReg;
       break;
     }
+    assert(ExecSrcReg != ExecReg &&
+           "V_SET_INACTIVE must be in known WWM region");
 
     // Ideally in WWM this operation is lowered to V_CNDMASK; however,
     // constant bus constraints and the presence of literal constants
     // present an issue.
     // Fallback to V_MOV base lowering in all but the common cases.
-    bool InWWM = !!ExecSrcReg;
-    bool UseVCndMask = false;
-    if (InWWM) {
-      const MachineFunction *MF = MI.getParent()->getParent();
-      const MachineRegisterInfo &MRI = MF->getRegInfo();
-      const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64;
-      const MCInstrDesc &Desc = get(Opcode);
-      int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
-      int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
-      int ConstantBusLimit = ST.getConstantBusLimit(AMDGPU::V_CNDMASK_B32_e64);
-      int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
-      int ConstantBusUses = 1; // Starts at one for ExecRegSrc
-      int LiteralConstants = 0;
-      ConstantBusUses +=
-          usesConstantBus(MRI, ActiveSrc, Desc.operands()[Src1Idx]) ? 1 : 0;
-      ConstantBusUses +=
-          usesConstantBus(MRI, InactiveSrc, Desc.operands()[Src0Idx]) ? 1 : 0;
-      LiteralConstants +=
-          ActiveSrc.isImm() &&
-                  !isInlineConstant(ActiveSrc, Desc.operands()[Src1Idx])
-              ? 1
-              : 0;
-      LiteralConstants +=
-          InactiveSrc.isImm() &&
-                  !isInlineConstant(InactiveSrc, Desc.operands()[Src0Idx])
-              ? 1
-              : 0;
-      UseVCndMask = ConstantBusUses <= ConstantBusLimit &&
-                    LiteralConstants <= LiteralLimit &&
-                    (!VMov64 || (ActiveSrc.isReg() && InactiveSrc.isReg()));
+    const bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32;
+    const MachineFunction *MF = MI.getParent()->getParent();
+    const MachineRegisterInfo &MRI = MF->getRegInfo();
+    const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64;
+    const MCInstrDesc &Desc = get(Opcode);
+
+    const APInt ActiveImm(64, ActiveSrc.isImm() ? ActiveSrc.getImm() : 0);
+    const APInt InactiveImm(64, InactiveSrc.isImm() ? InactiveSrc.getImm() : 0);
+    const APInt ActiveImmLo(32, ActiveImm.getLoBits(32).getZExtValue());
+    const APInt ActiveImmHi(32, ActiveImm.getHiBits(32).getZExtValue());
+    const APInt InactiveImmLo(32, InactiveImm.getLoBits(32).getZExtValue());
+    const APInt InactiveImmHi(32, InactiveImm.getHiBits(32).getZExtValue());
+
+    int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+    int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+
+    int ConstantBusLimit = ST.getConstantBusLimit(AMDGPU::V_CNDMASK_B32_e64);
+    int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
+    int ConstantBusUses =
+        1 + // Starts at 1 for ExecSrcReg
+        (usesConstantBus(MRI, ActiveSrc, Desc.operands()[Src1Idx]) ? 1 : 0) +
+        (usesConstantBus(MRI, InactiveSrc, Desc.operands()[Src0Idx]) ? 1 : 0);
+    int LiteralConstants =
+        (ActiveSrc.isImm() && !isInlineConstant(ActiveImm) ? 1 : 0) +
+        (InactiveSrc.isImm() && !isInlineConstant(InactiveImm) ? 1 : 0);
+
+    bool UseVCndMask =
+        ConstantBusUses <= ConstantBusLimit && LiteralConstants <= LiteralLimit;
+    if (VMov64 && UseVCndMask) {
+      // Decomposition must not introduce new literals.
+      UseVCndMask &=
+          ActiveSrc.isReg() ||
+          (isInlineConstant(ActiveImmLo) && isInlineConstant(ActiveImmHi)) ||
+          (!isInlineConstant(ActiveImm));
+      UseVCndMask &= InactiveSrc.isReg() ||
+                     (isInlineConstant(InactiveImmLo) &&
+                      isInlineConstant(InactiveImmHi)) ||
+                     (!isInlineConstant(InactiveImm));
     }
 
     if (UseVCndMask && VMov64) {
-      // WWM B64; decompose to two B32 operations.
-      // Test above ensures that both sources are registers.
-      // Note: this is done to avoid falling back to V_MOV multiple times
-      // and introducing exec manipulation for each VGPR separately.
-      assert(ActiveSrc.isReg() && InactiveSrc.isReg());
-      Register ActiveLo = RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub0);
-      Register ActiveHi = RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub1);
-      Register InactiveLo = RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub0);
-      Register InactiveHi = RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub1);
-      MachineInstr *Tmp;
-      Tmp = BuildMI(MBB, MI, DL, get(AMDGPU::V_SET_INACTIVE_B32),
-                    RI.getSubReg(DstReg, AMDGPU::sub0))
-                .addReg(InactiveLo)
-                .addReg(ActiveLo)
-                .addReg(ExecSrcReg, RegState::Implicit)
-                .addReg(DstReg, RegState::ImplicitDefine);
-      expandPostRAPseudo(*Tmp);
-      Tmp = BuildMI(MBB, MI, DL, get(AMDGPU::V_SET_INACTIVE_B32),
-                    RI.getSubReg(DstReg, AMDGPU::sub1))
-                .addReg(InactiveHi, InactiveSrc.isKill() ? RegState::Kill : 0)
-                .addReg(ActiveHi, ActiveSrc.isKill() ? RegState::Kill : 0)
-                .addReg(ExecSrcReg, RegState::Implicit)
-                .addReg(DstReg, RegState::ImplicitDefine);
-      expandPostRAPseudo(*Tmp);
+      // Dual V_CNDMASK_B32
+      MachineOperand ActiveLo =
+          ActiveSrc.isReg()
+              ? MachineOperand::CreateReg(
+                    RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub0), false,
+                    /*isImp=*/false, /*isKill*/ false)
+              : MachineOperand::CreateImm(ActiveImmLo.getSExtValue());
+      MachineOperand ActiveHi =
+          ActiveSrc.isReg()
+              ? MachineOperand::CreateReg(
+                    RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub1), false,
+                    /*isImp=*/false, /*isKill*/ ActiveSrc.isKill())
+              : MachineOperand::CreateImm(ActiveImmHi.getSExtValue());
+      MachineOperand InactiveLo =
+          InactiveSrc.isReg()
+              ? MachineOperand::CreateReg(
+                    RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub0), false,
+                    /*isImp=*/false, /*isKill*/ false)
+              : MachineOperand::CreateImm(InactiveImmLo.getSExtValue());
+      MachineOperand InactiveHi =
+          InactiveSrc.isReg()
+              ? MachineOperand::CreateReg(
+                    RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub1), false,
+                    /*isImp=*/false, /*isKill*/ InactiveSrc.isKill())
+              : MachineOperand::CreateImm(InactiveImmHi.getSExtValue());
+      BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DstReg, AMDGPU::sub0))
+          .addImm(0)
+          .add(InactiveLo)
+          .addImm(0)
+          .add(ActiveLo)
+          .addReg(ExecSrcReg)
+          .addReg(DstReg, RegState::ImplicitDefine);
+      BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DstReg, AMDGPU::sub1))
+          .addImm(0)
+          .add(InactiveHi)
+          .addImm(0)
+          .add(ActiveHi)
+          .addReg(ExecSrcReg)
+          .addReg(DstReg, RegState::ImplicitDefine);
     } else if (UseVCndMask) {
-      // WWM B32; use V_CNDMASK.
-      MachineInstr *VCndMask =
-          BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
-              .addImm(0)
-              .add(InactiveSrc)
-              .addImm(0)
-              .add(ActiveSrc)
-              .addReg(ExecSrcReg);
-      // Copy implicit defs in case this is part of V_SET_INACTIVE_B64.
-      for (auto &Op : MI.implicit_operands()) {
-        if (!Op.isDef())
-          continue;
-        VCndMask->addOperand(Op);
-      }
+      // Single V_CNDMASK_B32
+      BuildMI(MBB, MI, DL, get(Opcode), DstReg)
+          .addImm(0)
+          .add(InactiveSrc)
+          .addImm(0)
+          .add(ActiveSrc)
+          .addReg(ExecSrcReg);
     } else {
       // Fallback V_MOV case.
-      // Avoid unnecessary work if a src is the destination.
+      // Avoid unnecessary work if a source VGPR is also the destination.
      // This can happen if WWM register allocation was efficient.
-      bool SkipActive = ActiveSrc.isReg() && ActiveSrc.getReg() == DstReg;
-      bool SkipInactive = InactiveSrc.isReg() && InactiveSrc.getReg() == DstReg;
-      if (!SkipActive) {
-        if (InWWM) {
-          // Cancel WWM
-          BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addReg(ExecSrcReg);
+      // Note: this assumes WWM execution.
+      bool DstIsActive = ActiveSrc.isReg() && ActiveSrc.getReg() == DstReg;
+      bool DstIsInactive =
+          InactiveSrc.isReg() && InactiveSrc.getReg() == DstReg;
+      if (!DstIsInactive) {
+        // Set exec mask to inactive lanes,
+        // but only if active lanes would be overwritten.
+        if (DstIsActive) {
+          MachineInstr *ExecMI =
+              BuildMI(MBB, MI, DL, get(NotOpc), ExecReg).addReg(ExecSrcReg);
+          ExecMI->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
        }
-        // Copy active lanes
+        // Copy inactive lanes
         MachineInstr *VMov =
-            BuildMI(MBB, MI, DL, get(VMovOpc), MI.getOperand(0).getReg())
-                .add(ActiveSrc);
+            BuildMI(MBB, MI, DL, get(VMovOpc), DstReg).add(InactiveSrc);
         if (VMov64)
           expandPostRAPseudo(*VMov);
       }
-      if (!SkipInactive) {
-        // Set exec mask to inactive lanes
-        MachineInstr *ExecMI = BuildMI(MBB, MI, DL, get(NotOpc), ExecReg)
-                                   .addReg(InWWM ? ExecSrcReg : ExecReg);
-        ExecMI->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
-        // Copy inactive lanes
+      if (!DstIsActive) {
+        // Set exec mask to active lanes
+        BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addReg(ExecSrcReg);
+        // Copy active lanes
         MachineInstr *VMov =
-            BuildMI(MBB, MI, DL, get(VMovOpc), DstReg).add(InactiveSrc);
+            BuildMI(MBB, MI, DL, get(VMovOpc), MI.getOperand(0).getReg())
+                .add(ActiveSrc);
         if (VMov64)
           expandPostRAPseudo(*VMov);
-        if (!InWWM) {
-          // Restore original exec mask
-          BuildMI(MBB, MI, DL, get(NotOpc), ExecReg).addReg(ExecReg);
-        }
-      }
-      if (InWWM) {
-        // Restore WWM
-        BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addImm(-1);
       }
+      // Restore WWM
+      BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addImm(-1);
     }
     MI.eraseFromParent();
     break;
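To make the decomposition gate in the diff concrete: splitting a 64-bit source into two 32-bit halves must not add literal operands. Below is a rough stand-alone sketch of that rule; isInline32 and canDecompose are simplified stand-ins for illustration, not the LLVM isInlineConstant API.

    #include <cstdint>

    // Simplified 32-bit inline-constant test (stand-in for isInlineConstant).
    static bool isInline32(uint32_t Bits) {
      int32_t S = static_cast<int32_t>(Bits);
      return S >= -16 && S <= 64;
    }

    // Mirrors the gate in the diff: decompose a 64-bit operand only when it
    // is a register, when both 32-bit halves encode as inline constants, or
    // when the 64-bit immediate was already a literal (already counted in
    // the literal budget above).
    static bool canDecompose(bool IsReg, bool IsInline64, uint64_t Imm) {
      if (IsReg)
        return true;
      uint32_t Lo = static_cast<uint32_t>(Imm);
      uint32_t Hi = static_cast<uint32_t>(Imm >> 32);
      return (isInline32(Lo) && isInline32(Hi)) || !IsInline64;
    }

Each half then feeds one of the two V_CNDMASK_B32 instructions emitted into sub0 and sub1 of the destination, which avoids falling back to V_MOV with exec manipulation for each half separately.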
