Skip to content

Commit e55d6f5

Browse files
authored
[AMDGPU] Simplify and improve codegen for llvm.amdgcn.set.inactive (#107889)
Always generate v_cndmask_b32 instead of modifying exec around v_mov_b32. This is expected to be faster because modifying exec generally causes pipeline stalls.
1 parent 30fbfe5 commit e55d6f5

25 files changed

+4167
-4458
lines changed

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5439,6 +5439,8 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
54395439

54405440
bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
54415441
IID == Intrinsic::amdgcn_permlanex16;
5442+
bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
5443+
IID == Intrinsic::amdgcn_set_inactive_chain_arg;
54425444

54435445
auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
54445446
Register Src2, LLT VT) -> Register {
@@ -5448,6 +5450,8 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
54485450
case Intrinsic::amdgcn_permlane64:
54495451
return LaneOp.getReg(0);
54505452
case Intrinsic::amdgcn_readlane:
5453+
case Intrinsic::amdgcn_set_inactive:
5454+
case Intrinsic::amdgcn_set_inactive_chain_arg:
54515455
return LaneOp.addUse(Src1).getReg(0);
54525456
case Intrinsic::amdgcn_writelane:
54535457
return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
@@ -5472,7 +5476,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
54725476
Register Src0 = MI.getOperand(2).getReg();
54735477
Register Src1, Src2;
54745478
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
5475-
IsPermLane16) {
5479+
IsSetInactive || IsPermLane16) {
54765480
Src1 = MI.getOperand(3).getReg();
54775481
if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
54785482
Src2 = MI.getOperand(4).getReg();
@@ -5490,7 +5494,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
54905494
if (Size < 32) {
54915495
Src0 = B.buildAnyExt(S32, Src0).getReg(0);
54925496

5493-
if (IsPermLane16)
5497+
if (IsSetInactive || IsPermLane16)
54945498
Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
54955499

54965500
if (IID == Intrinsic::amdgcn_writelane)
@@ -5526,7 +5530,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
55265530
MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
55275531
MachineInstrBuilder Src1Parts, Src2Parts;
55285532

5529-
if (IsPermLane16)
5533+
if (IsSetInactive || IsPermLane16)
55305534
Src1Parts = B.buildUnmerge(PartialResTy, Src1);
55315535

55325536
if (IID == Intrinsic::amdgcn_writelane)
@@ -5535,7 +5539,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
55355539
for (unsigned i = 0; i < NumParts; ++i) {
55365540
Src0 = Src0Parts.getReg(i);
55375541

5538-
if (IsPermLane16)
5542+
if (IsSetInactive || IsPermLane16)
55395543
Src1 = Src1Parts.getReg(i);
55405544

55415545
if (IID == Intrinsic::amdgcn_writelane)
@@ -7496,6 +7500,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
74967500
case Intrinsic::amdgcn_permlane16:
74977501
case Intrinsic::amdgcn_permlanex16:
74987502
case Intrinsic::amdgcn_permlane64:
7503+
case Intrinsic::amdgcn_set_inactive:
7504+
case Intrinsic::amdgcn_set_inactive_chain_arg:
74997505
return legalizeLaneOp(Helper, MI, IntrID);
75007506
case Intrinsic::amdgcn_s_buffer_prefetch_data:
75017507
return legalizeSBufferPrefetch(Helper, MI);

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6102,6 +6102,8 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
61026102
unsigned IID = N->getConstantOperandVal(0);
61036103
bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
61046104
IID == Intrinsic::amdgcn_permlanex16;
6105+
bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6106+
IID == Intrinsic::amdgcn_set_inactive_chain_arg;
61056107
SDLoc SL(N);
61066108
MVT IntVT = MVT::getIntegerVT(ValSize);
61076109

@@ -6119,6 +6121,8 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
61196121
Operands.push_back(Src2);
61206122
[[fallthrough]];
61216123
case Intrinsic::amdgcn_readlane:
6124+
case Intrinsic::amdgcn_set_inactive:
6125+
case Intrinsic::amdgcn_set_inactive_chain_arg:
61226126
Operands.push_back(Src1);
61236127
[[fallthrough]];
61246128
case Intrinsic::amdgcn_readfirstlane:
@@ -6145,7 +6149,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
61456149
SDValue Src0 = N->getOperand(1);
61466150
SDValue Src1, Src2;
61476151
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6148-
IsPermLane16) {
6152+
IsSetInactive || IsPermLane16) {
61496153
Src1 = N->getOperand(2);
61506154
if (IID == Intrinsic::amdgcn_writelane || IsPermLane16)
61516155
Src2 = N->getOperand(3);
@@ -6161,7 +6165,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
61616165
Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
61626166
SL, MVT::i32);
61636167

6164-
if (IsPermLane16) {
6168+
if (IsSetInactive || IsPermLane16) {
61656169
Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
61666170
SL, MVT::i32);
61676171
}
@@ -6237,7 +6241,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
62376241
Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
62386242
DAG.getConstant(EltIdx, SL, MVT::i32));
62396243

6240-
if (IsPermLane16)
6244+
if (IsSetInactive || IsPermLane16)
62416245
Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
62426246
DAG.getConstant(EltIdx, SL, MVT::i32));
62436247

@@ -6246,7 +6250,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
62466250
DAG.getConstant(EltIdx, SL, MVT::i32));
62476251

62486252
Pieces.push_back(
6249-
IsPermLane16
6253+
IsSetInactive || IsPermLane16
62506254
? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
62516255
: createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
62526256
EltIdx += 2;
@@ -6262,7 +6266,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
62626266
MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
62636267
Src0 = DAG.getBitcast(VecVT, Src0);
62646268

6265-
if (IsPermLane16)
6269+
if (IsSetInactive || IsPermLane16)
62666270
Src1 = DAG.getBitcast(VecVT, Src1);
62676271

62686272
if (IID == Intrinsic::amdgcn_writelane)
@@ -8745,6 +8749,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
87458749
case Intrinsic::amdgcn_permlane16:
87468750
case Intrinsic::amdgcn_permlanex16:
87478751
case Intrinsic::amdgcn_permlane64:
8752+
case Intrinsic::amdgcn_set_inactive:
8753+
case Intrinsic::amdgcn_set_inactive_chain_arg:
87488754
return lowerLaneOp(*this, Op.getNode(), DAG);
87498755
default:
87508756
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 8 additions & 155 deletions
Original file line numberDiff line numberDiff line change
@@ -2097,21 +2097,6 @@ unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
20972097
}
20982098
}
20992099

2100-
Register SIInstrInfo::findSetInactiveMask(const MachineInstr &MI) {
2101-
assert(MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
2102-
MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64);
2103-
for (auto &Op : MI.implicit_operands()) {
2104-
if (Op.isDef())
2105-
continue;
2106-
Register OpReg = Op.getReg();
2107-
if (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO ||
2108-
OpReg == AMDGPU::SCC)
2109-
continue;
2110-
return OpReg;
2111-
}
2112-
return Register();
2113-
}
2114-
21152100
bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
21162101
MachineBasicBlock &MBB = *MI.getParent();
21172102
DebugLoc DL = MBB.findDebugLoc(MI);
@@ -2286,147 +2271,15 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
22862271
MI.eraseFromParent();
22872272
break;
22882273
}
2289-
case AMDGPU::V_SET_INACTIVE_B32:
2290-
case AMDGPU::V_SET_INACTIVE_B64: {
2291-
unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
2292-
unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2293-
unsigned VMovOpc = MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64
2294-
? AMDGPU::V_MOV_B64_PSEUDO
2295-
: AMDGPU::V_MOV_B32_e32;
2296-
Register ExecReg = RI.getExec();
2274+
case AMDGPU::V_SET_INACTIVE_B32: {
2275+
// Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
22972276
Register DstReg = MI.getOperand(0).getReg();
2298-
MachineOperand &ActiveSrc = MI.getOperand(1);
2299-
MachineOperand &InactiveSrc = MI.getOperand(2);
2300-
2301-
// Find implicit register defining lanes active outside WWM.
2302-
Register ExecSrcReg = findSetInactiveMask(MI);
2303-
assert(ExecSrcReg && "V_SET_INACTIVE must be in known WWM region");
2304-
// Note: default here is set to ExecReg so that functional MIR is still
2305-
// generated if implicit def is not found and assertions are disabled.
2306-
if (!ExecSrcReg)
2307-
ExecSrcReg = ExecReg;
2308-
2309-
// Ideally in WWM this operation is lowered to V_CNDMASK; however,
2310-
// constant bus constraints and the presence of literal constants
2311-
// present an issue.
2312-
// Fallback to V_MOV base lowering in all but the common cases.
2313-
const bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32;
2314-
MachineFunction *MF = MBB.getParent();
2315-
MachineRegisterInfo &MRI = MF->getRegInfo();
2316-
const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64;
2317-
const MCInstrDesc &Desc = get(Opcode);
2318-
2319-
const APInt ActiveImm(64, ActiveSrc.isImm() ? ActiveSrc.getImm() : 0);
2320-
const APInt InactiveImm(64, InactiveSrc.isImm() ? InactiveSrc.getImm() : 0);
2321-
const APInt ActiveImmLo(32, ActiveImm.getLoBits(32).getZExtValue());
2322-
const APInt ActiveImmHi(32, ActiveImm.getHiBits(32).getZExtValue());
2323-
const APInt InactiveImmLo(32, InactiveImm.getLoBits(32).getZExtValue());
2324-
const APInt InactiveImmHi(32, InactiveImm.getHiBits(32).getZExtValue());
2325-
2326-
int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
2327-
int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
2328-
2329-
int ConstantBusLimit = ST.getConstantBusLimit(AMDGPU::V_CNDMASK_B32_e64);
2330-
int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
2331-
int ConstantBusUses =
2332-
1 + // Starts at 1 for ExecSrcReg
2333-
(usesConstantBus(MRI, ActiveSrc, Desc.operands()[Src1Idx]) ? 1 : 0) +
2334-
(usesConstantBus(MRI, InactiveSrc, Desc.operands()[Src0Idx]) ? 1 : 0);
2335-
int LiteralConstants =
2336-
((ActiveSrc.isReg() ||
2337-
(ActiveSrc.isImm() && isInlineConstant(ActiveImm)))
2338-
? 0
2339-
: 1) +
2340-
((InactiveSrc.isReg() ||
2341-
(InactiveSrc.isImm() && isInlineConstant(InactiveImm)))
2342-
? 0
2343-
: 1);
2344-
2345-
bool UseVCndMask =
2346-
ConstantBusUses <= ConstantBusLimit && LiteralConstants <= LiteralLimit;
2347-
if (VMov64 && UseVCndMask) {
2348-
// Decomposition must not introduce new literals.
2349-
UseVCndMask &=
2350-
ActiveSrc.isReg() ||
2351-
(isInlineConstant(ActiveImmLo) && isInlineConstant(ActiveImmHi)) ||
2352-
(!isInlineConstant(ActiveImm));
2353-
UseVCndMask &= InactiveSrc.isReg() ||
2354-
(isInlineConstant(InactiveImmLo) &&
2355-
isInlineConstant(InactiveImmHi)) ||
2356-
(!isInlineConstant(InactiveImm));
2357-
}
2358-
2359-
if (UseVCndMask && VMov64) {
2360-
// Dual V_CNDMASK_B32
2361-
MachineOperand ActiveLo = buildExtractSubRegOrImm(
2362-
MI, MRI, ActiveSrc, nullptr, AMDGPU::sub0, nullptr);
2363-
MachineOperand ActiveHi = buildExtractSubRegOrImm(
2364-
MI, MRI, ActiveSrc, nullptr, AMDGPU::sub1, nullptr);
2365-
MachineOperand InactiveLo = buildExtractSubRegOrImm(
2366-
MI, MRI, InactiveSrc, nullptr, AMDGPU::sub0, nullptr);
2367-
MachineOperand InactiveHi = buildExtractSubRegOrImm(
2368-
MI, MRI, InactiveSrc, nullptr, AMDGPU::sub1, nullptr);
2369-
if (ActiveSrc.isReg())
2370-
ActiveHi.setIsKill(ActiveSrc.isKill());
2371-
if (InactiveSrc.isReg())
2372-
InactiveHi.setIsKill(InactiveSrc.isKill());
2373-
BuildMI(MBB, MI, DL, Desc, RI.getSubReg(DstReg, AMDGPU::sub0))
2374-
.addImm(0)
2375-
.add(InactiveLo)
2376-
.addImm(0)
2377-
.add(ActiveLo)
2378-
.addReg(ExecSrcReg)
2379-
.addReg(DstReg, RegState::ImplicitDefine);
2380-
BuildMI(MBB, MI, DL, Desc, RI.getSubReg(DstReg, AMDGPU::sub1))
2381-
.addImm(0)
2382-
.add(InactiveHi)
2383-
.addImm(0)
2384-
.add(ActiveHi)
2385-
.addReg(ExecSrcReg)
2386-
.addReg(DstReg, RegState::ImplicitDefine);
2387-
} else if (UseVCndMask) {
2388-
// Single V_CNDMASK_B32
2389-
BuildMI(MBB, MI, DL, Desc, DstReg)
2390-
.addImm(0)
2391-
.add(InactiveSrc)
2392-
.addImm(0)
2393-
.add(ActiveSrc)
2394-
.addReg(ExecSrcReg);
2395-
} else {
2396-
// Fallback V_MOV case.
2397-
// Avoid unnecessary work if a source VGPR is also the destination.
2398-
// This can happen if WWM register allocation was efficient.
2399-
// Note: this assumes WWM execution.
2400-
bool DstIsActive = ActiveSrc.isReg() && ActiveSrc.getReg() == DstReg;
2401-
bool DstIsInactive =
2402-
InactiveSrc.isReg() && InactiveSrc.getReg() == DstReg;
2403-
if (!DstIsInactive) {
2404-
// Set exec mask to inactive lanes,
2405-
// but only if active lanes would be overwritten.
2406-
if (DstIsActive) {
2407-
BuildMI(MBB, MI, DL, get(NotOpc), ExecReg)
2408-
.addReg(ExecSrcReg)
2409-
.setOperandDead(3); // Dead scc
2410-
}
2411-
// Copy inactive lanes
2412-
MachineInstr *VMov =
2413-
BuildMI(MBB, MI, DL, get(VMovOpc), DstReg).add(InactiveSrc);
2414-
if (VMov64)
2415-
expandPostRAPseudo(*VMov);
2416-
}
2417-
if (!DstIsActive) {
2418-
// Set exec mask to active lanes
2419-
BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addReg(ExecSrcReg);
2420-
// Copy active lanes
2421-
MachineInstr *VMov =
2422-
BuildMI(MBB, MI, DL, get(VMovOpc), MI.getOperand(0).getReg())
2423-
.add(ActiveSrc);
2424-
if (VMov64)
2425-
expandPostRAPseudo(*VMov);
2426-
}
2427-
// Restore WWM
2428-
BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addImm(-1);
2429-
}
2277+
BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2278+
.add(MI.getOperand(3))
2279+
.add(MI.getOperand(4))
2280+
.add(MI.getOperand(1))
2281+
.add(MI.getOperand(2))
2282+
.add(MI.getOperand(5));
24302283
MI.eraseFromParent();
24312284
break;
24322285
}

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1437,8 +1437,6 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
14371437
// This is used if an operand is a 32 bit register but needs to be aligned
14381438
// regardless.
14391439
void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const;
1440-
1441-
static Register findSetInactiveMask(const MachineInstr &MI);
14421440
};
14431441

14441442
/// \brief Returns true if a reg:subreg pair P has a TRC class

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 4 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -243,29 +243,16 @@ def : GCNPat <(f32 (fptrunc_round f64:$src0, (i32 SupportedRoundMode:$round))),
243243

244244
// Overwrite the inactive lanes of dst with inactive, keeping src in the active
245245
// lanes. Expanded after register allocation to a single V_CNDMASK_B32.
246-
let Defs = [SCC], isConvergent = 1 in {
247-
def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst),
248-
(ins VSrc_b32: $src, VSrc_b32:$inactive), []>;
249-
250-
def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
251-
(ins VSrc_b64: $src, VSrc_b64:$inactive), []>;
252-
} // End Defs = [SCC]
246+
let isConvergent = 1 in
247+
def V_SET_INACTIVE_B32 : VOP3_Pseudo<"v_set_inactive_b32", VOP2e_I32_I32_I32_I1>;
253248

254249
foreach vt = Reg32Types.types in {
255250
def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
256-
(V_SET_INACTIVE_B32 VSrc_b32:$src, VSrc_b32:$inactive)>;
257-
}
258-
259-
foreach vt = Reg64Types.types in {
260-
def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
261-
(V_SET_INACTIVE_B64 VSrc_b64:$src, VSrc_b64:$inactive)>;
251+
(V_SET_INACTIVE_B32 0, VSrc_b32:$src, 0, VSrc_b32:$inactive, (IMPLICIT_DEF))>;
262252
}
263253

264254
def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
265-
(V_SET_INACTIVE_B32 VGPR_32:$src, VGPR_32:$inactive)>;
266-
267-
def : GCNPat<(i64 (int_amdgcn_set_inactive_chain_arg i64:$src, i64:$inactive)),
268-
(V_SET_INACTIVE_B64 VReg_64:$src, VReg_64:$inactive)>;
255+
(V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;
269256

270257
let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
271258
def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),

llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -215,8 +215,7 @@ bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
215215
for (MachineBasicBlock *MBB : RPOT) {
216216
bool InWWM = false;
217217
for (MachineInstr &MI : *MBB) {
218-
if (MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
219-
MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64)
218+
if (MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32)
220219
RegsAssigned |= processDef(MI.getOperand(0));
221220

222221
if (MI.getOpcode() == AMDGPU::SI_SPILL_S32_TO_VGPR) {

0 commit comments

Comments
 (0)