
Commit 4626fb4

[AMDGPU][AMDGPUDemoteSCCBranchToExecz] Implementation: demote s_cbranch_scc branches into vcmp + s_cbranch_execz branches
1 parent cc8ceab

File tree

5 files changed: +339 -112 lines

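In rough terms, the pass rewrites a uniform compare-and-branch over a short then-block into an exec-mask sequence. A minimal sketch of the shape of the transformation (illustrative only: wave64 assumed, the registers s0 and s[2:3] and the block labels are made up, and the exact opcodes are picked by getVALUOpc in the diff below):

Before:

      s_cmp_lg_u32 s0, 0
      s_cbranch_scc0 .LBB0_tail       ; scalar branch on SCC
      ; ... short then-block ...
  .LBB0_tail:

After:

      v_cmp_ne_u32_e64 vcc, s0, 0     ; compare demoted to the VALU, result in VCC
      s_and_saveexec_b64 s[2:3], vcc  ; back up EXEC, mask off lanes that failed
      s_cbranch_execz .LBB0_tail      ; taken only when no lane remains active
      ; ... short then-block ...
  .LBB0_tail:
      s_mov_b64 exec, s[2:3]          ; restore the saved EXEC mask (restoreExec)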

llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.cpp

Lines changed: 202 additions & 5 deletions

@@ -2,18 +2,215 @@
 
 #include "AMDGPU.h"
 #include "AMDGPUDemoteSCCBranchToExecz.h"
+#include "GCNSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
 
 using namespace llvm;
 
 namespace {
 #define DEBUG_TYPE "amdgpu-demote-scc-to-execz"
-const char PassName[] = "AMDGPU if conversion";
+const char PassName[] = "AMDGPU s_cbranch_scc to s_cbranch_execz conversion";
+
+std::optional<unsigned> getVALUOpc(const MachineInstr &MI,
+                                   bool Reverse = false) {
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+#define HandleOpcAndReverse(Opc, ReverseOpc, VOpc, ReverseVOpc)                \
+  case Opc:                                                                    \
+    return Reverse ? ReverseVOpc : VOpc;                                       \
+  case ReverseOpc:                                                             \
+    return Reverse ? VOpc : ReverseVOpc
+    HandleOpcAndReverse(AMDGPU::S_CMP_EQ_I32, AMDGPU::S_CMP_LG_I32,
+                        AMDGPU::V_CMP_EQ_I32_e64, AMDGPU::V_CMP_NE_I32_e64);
+    HandleOpcAndReverse(AMDGPU::S_CMP_EQ_U32, AMDGPU::S_CMP_LG_U32,
+                        AMDGPU::V_CMP_EQ_U32_e64, AMDGPU::V_CMP_NE_U32_e64);
+    HandleOpcAndReverse(AMDGPU::S_CMP_GT_I32, AMDGPU::S_CMP_LE_I32,
+                        AMDGPU::V_CMP_GT_I32_e64, AMDGPU::V_CMP_LE_I32_e64);
+    HandleOpcAndReverse(AMDGPU::S_CMP_GT_U32, AMDGPU::S_CMP_LE_U32,
+                        AMDGPU::V_CMP_GT_U32_e64, AMDGPU::V_CMP_LE_U32_e64);
+    HandleOpcAndReverse(AMDGPU::S_CMP_GE_I32, AMDGPU::S_CMP_LT_I32,
+                        AMDGPU::V_CMP_GE_I32_e64, AMDGPU::V_CMP_LT_I32_e64);
+    HandleOpcAndReverse(AMDGPU::S_CMP_GE_U32, AMDGPU::S_CMP_LT_U32,
+                        AMDGPU::V_CMP_GE_U32_e64, AMDGPU::V_CMP_LT_U32_e64);
+    HandleOpcAndReverse(AMDGPU::S_CMP_EQ_U64, AMDGPU::S_CMP_LG_U64,
+                        AMDGPU::V_CMP_EQ_U64_e64, AMDGPU::V_CMP_NE_U64_e64);
+#undef HandleOpcAndReverse
+  default:
+    break;
+  }
+  return std::nullopt;
+}
+
+bool isSCmpPromotableToVCmp(const MachineInstr &MI) {
+  return getVALUOpc(MI).has_value();
+}
+
+bool isTriangular(MachineBasicBlock &Head, MachineBasicBlock *&Then,
+                  MachineBasicBlock *&Tail) {
+  if (Head.succ_size() != 2)
+    return false;
+
+  Then = Head.succ_begin()[0];
+  Tail = Head.succ_begin()[1];
+
+  // Canonicalize so that Then has Head as its single predecessor.
+  if (Then->pred_size() != 1)
+    std::swap(Then, Tail);
+
+  if (Then->pred_size() != 1 || Then->succ_size() != 1)
+    return false;
+
+  return *Then->succ_begin() == Tail;
+}
+
+bool hasPromotableCmpCondition(MachineInstr &Term, MachineInstr *&Cmp) {
+  auto CmpIt = std::next(Term.getReverseIterator());
+  if (CmpIt == Term.getParent()->instr_rend())
+    return false;
+
+  if (!isSCmpPromotableToVCmp(*CmpIt))
+    return false;
+
+  Cmp = &*CmpIt;
+  return true;
+}
+
+bool hasCbranchSCCTerm(MachineBasicBlock &Head, MachineInstr *&Term) {
+  auto TermIt = Head.getFirstInstrTerminator();
+  if (TermIt == Head.end())
+    return false;
+
+  switch (TermIt->getOpcode()) {
+  case AMDGPU::S_CBRANCH_SCC0:
+  case AMDGPU::S_CBRANCH_SCC1:
+    Term = &*TermIt;
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool isTriangularSCCBranch(MachineBasicBlock &Head, MachineInstr *&Term,
+                           MachineInstr *&Cmp, MachineBasicBlock *&Then,
+                           MachineBasicBlock *&Tail) {
+
+  if (!hasCbranchSCCTerm(Head, Term))
+    return false;
+
+  bool SCCIsUsedOutsideHead = any_of(
+      Head.liveouts(), [](const auto &P) { return P.PhysReg == AMDGPU::SCC; });
+  if (SCCIsUsedOutsideHead)
+    return false;
+
+  if (!isTriangular(Head, Then, Tail))
+    return false;
+
+  // PHI nodes in the tail can prevent splicing the instructions of the then
+  // and tail blocks into the head.
+  if (!Tail->empty() && Tail->begin()->isPHI())
+    return false;
+
+  if (!hasPromotableCmpCondition(*Term, Cmp))
+    return false;
+
+  return true;
+}
+
+bool SCC1JumpsToThen(const MachineInstr &Term, const MachineBasicBlock &Then) {
+  MachineBasicBlock *TBB = Term.getOperand(0).getMBB();
+  return (TBB == &Then) == (Term.getOpcode() == AMDGPU::S_CBRANCH_SCC1);
+}
 
 class AMDGPUDemoteSCCBranchToExecz {
+  MachineFunction &MF;
+  const GCNSubtarget &ST;
+  const SIInstrInfo &TII;
+  const SIRegisterInfo &RegInfo;
+  const TargetSchedModel &SchedModel;
+
 public:
-  AMDGPUDemoteSCCBranchToExecz() = default;
+  AMDGPUDemoteSCCBranchToExecz(MachineFunction &MF)
+      : MF(MF), ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
+        RegInfo(*ST.getRegisterInfo()), SchedModel(TII.getSchedModel()) {}
+
+  bool mustRetainSCCBranch(const MachineInstr &Term, const MachineInstr &Cmp,
+                           const MachineBasicBlock &Then,
+                           const MachineBasicBlock &Tail) {
+    bool IsWave32 = TII.isWave32();
+    unsigned AndSaveExecOpc =
+        IsWave32 ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
+    unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    unsigned NewOps[] = {*getVALUOpc(Cmp, !SCC1JumpsToThen(Term, Then)),
+                         AndSaveExecOpc, Mov};
+    unsigned NewOpsCost = 0;
+    for (unsigned Opc : NewOps)
+      NewOpsCost += SchedModel.computeInstrLatency(Opc);
+    unsigned OldCmpCost = SchedModel.computeInstrLatency(&Cmp, false);
+
+    assert(NewOpsCost >= OldCmpCost);
+    return !TII.mustRetainExeczBranch(Term, Then, Tail,
+                                      NewOpsCost - OldCmpCost);
+  }
+
+  void demoteCmp(MachineInstr &Term, MachineInstr &Cmp, MachineBasicBlock &Head,
+                 MachineBasicBlock &Then, MachineBasicBlock &Tail) {
+    unsigned NewCmpOpc = *getVALUOpc(Cmp, !SCC1JumpsToThen(Term, Then));
+    Cmp.setDesc(TII.get(NewCmpOpc));
+
+    // Drop the stale implicit-def of SCC inherited from the s_cmp.
+    Cmp.removeOperand(2);
+
+    auto VCC = RegInfo.getVCC();
+    auto Exec = RegInfo.getExec();
 
-  bool run() { return false; }
+    auto &MRI = MF.getRegInfo();
+    MCRegister ExecBackup =
+        MRI.createVirtualRegister(RegInfo.getPhysRegBaseClass(Exec));
+
+    Cmp.insert(Cmp.operands_begin(), MachineOperand::CreateReg(VCC, true));
+    Cmp.addImplicitDefUseOperands(MF);
+
+    TII.legalizeOperands(Cmp);
+
+    bool IsWave32 = TII.isWave32();
+    unsigned AndSaveExecOpc =
+        IsWave32 ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
+    auto SaveAndMaskExec = BuildMI(*Term.getParent(), Term, Cmp.getDebugLoc(),
+                                   TII.get(AndSaveExecOpc), ExecBackup);
+    SaveAndMaskExec.addReg(VCC, RegState::Kill);
+    SaveAndMaskExec->getOperand(3).setIsDead(); // mark SCC as dead
+
+    DebugLoc DL = Term.getDebugLoc();
+    TII.removeBranch(Head);
+    MachineOperand Cond[] = {
+        MachineOperand::CreateImm(SIInstrInfo::BranchPredicate::EXECZ),
+        MachineOperand::CreateReg(RegInfo.getExec(), false)};
+    TII.insertBranch(Head, &Tail, &Then, Cond, DL);
+
+    TII.restoreExec(MF, Tail, Tail.instr_begin(), DebugLoc(), ExecBackup);
+  }
+
+  bool run() {
+    if (!SchedModel.hasInstrSchedModel())
+      return false;
+    bool Changed = false;
+
+    for (MachineBasicBlock &Head : MF) {
+      MachineInstr *Term;
+      MachineInstr *Cmp;
+      MachineBasicBlock *Then;
+      MachineBasicBlock *Tail;
+      if (!isTriangularSCCBranch(Head, Term, Cmp, Then, Tail))
+        continue;
+
+      if (!mustRetainSCCBranch(*Term, *Cmp, *Then, *Tail))
+        continue;
+
+      demoteCmp(*Term, *Cmp, Head, *Then, *Tail);
+      Changed = true;
+    }
+    return Changed;
+  }
 };
 
 class AMDGPUDemoteSCCBranchToExeczLegacy : public MachineFunctionPass {
@@ -23,7 +220,7 @@ class AMDGPUDemoteSCCBranchToExeczLegacy : public MachineFunctionPass {
   AMDGPUDemoteSCCBranchToExeczLegacy() : MachineFunctionPass(ID) {}
 
   bool runOnMachineFunction(MachineFunction &MF) override {
-    AMDGPUDemoteSCCBranchToExecz IfCvt{};
+    AMDGPUDemoteSCCBranchToExecz IfCvt{MF};
     return IfCvt.run();
   }
 
@@ -40,7 +237,7 @@ char AMDGPUDemoteSCCBranchToExeczLegacy::ID = 0;
 
 PreservedAnalyses llvm::AMDGPUDemoteSCCBranchToExeczPass::run(
     MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) {
-  AMDGPUDemoteSCCBranchToExecz IfCvt{};
+  AMDGPUDemoteSCCBranchToExecz IfCvt{MF};
   if (!IfCvt.run())
     return PreservedAnalyses::all();
   return PreservedAnalyses::none();
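To make the latency bookkeeping in mustRetainSCCBranch concrete, a worked example with assumed latencies (illustrative numbers, not taken from any real scheduling model): if the v_cmp, s_and_saveexec, and s_mov were to cost 4, 2, and 2 cycles while the replaced s_cmp costs 2, then

    NewOpsCost = 4 + 2 + 2 = 8
    OldCmpCost = 2
    ExtraTransformationCosts = NewOpsCost - OldCmpCost = 6

and those 6 extra cycles are charged against the then-block when mustRetainExeczBranch (below) runs its profitability check.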

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 82 additions & 0 deletions

@@ -4118,6 +4118,88 @@ bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
   return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
 }
 
+namespace {
+class BranchWeightCostModel {
+  const SIInstrInfo &TII;
+  const TargetSchedModel &SchedModel;
+  BranchProbability BranchProb;
+  static constexpr uint64_t BranchNotTakenCost = 1;
+  uint64_t BranchTakenCost;
+  uint64_t ThenCyclesCost;
+
+public:
+  BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch,
+                        const MachineBasicBlock &Succ,
+                        unsigned ExtraTransformationCosts)
+      : TII(TII), SchedModel(TII.getSchedModel()),
+        ThenCyclesCost(ExtraTransformationCosts) {
+    const MachineBasicBlock &Head = *Branch.getParent();
+    const auto *FromIt = find(Head.successors(), &Succ);
+    assert(FromIt != Head.succ_end());
+
+    BranchProb = Head.getSuccProbability(FromIt);
+    if (BranchProb.isUnknown())
+      BranchProb = BranchProbability::getZero();
+    BranchTakenCost = SchedModel.computeInstrLatency(&Branch);
+  }
+
+  bool isProfitable(const MachineInstr &MI) {
+    if (TII.isWaitcnt(MI.getOpcode()))
+      return false;
+
+    ThenCyclesCost += SchedModel.computeInstrLatency(&MI);
+
+    // Consider `P = N/D` to be the probability of execz being false, i.e. of
+    // the 'then' block being executed. The transformation is profitable if
+    // always executing the 'then' block is cheaper than sometimes executing
+    // 'then' and always executing s_cbranch_execz:
+    // * ThenCost <= P*ThenCost + (1-P)*BranchTakenCost + P*BranchNotTakenCost
+    // * (1-P)*ThenCost <= (1-P)*BranchTakenCost + P*BranchNotTakenCost
+    // * (D-N)/D*ThenCost <= (D-N)/D*BranchTakenCost + N/D*BranchNotTakenCost
+    uint64_t Numerator = BranchProb.getNumerator();
+    uint64_t Denominator = BranchProb.getDenominator();
+    return (Denominator - Numerator) * ThenCyclesCost <=
+           ((Denominator - Numerator) * BranchTakenCost +
+            Numerator * BranchNotTakenCost);
+  }
+};
+} // namespace
+
+bool SIInstrInfo::mustRetainExeczBranch(
+    const MachineInstr &Branch, const MachineBasicBlock &From,
+    const MachineBasicBlock &To, unsigned ExtraTransformationCosts) const {
+
+  assert(is_contained(Branch.getParent()->successors(), &From));
+  BranchWeightCostModel CostModel{*this, Branch, From,
+                                  ExtraTransformationCosts};
+
+  const MachineFunction *MF = From.getParent();
+  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
+       MBBI != End && MBBI != ToI; ++MBBI) {
+    const MachineBasicBlock &MBB = *MBBI;
+
+    for (const MachineInstr &MI : MBB) {
+      // When a uniform loop is inside non-uniform control flow, the branch
+      // leaving the loop might never be taken when EXEC = 0.
+      // Hence we should retain the cbranch out of the loop lest it become
+      // infinite.
+      if (MI.isConditionalBranch())
+        return true;
+
+      if (MI.isMetaInstruction())
+        continue;
+
+      if (hasUnwantedEffectsWhenEXECEmpty(MI))
+        return true;
+
+      if (!CostModel.isProfitable(MI))
+        return true;
+    }
+  }
+
+  return false;
+}
+
 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
   unsigned Opcode = MI.getOpcode();

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 14 additions & 4 deletions

@@ -87,6 +87,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   TargetSchedModel SchedModel;
   mutable std::unique_ptr<AMDGPUMIRFormatter> Formatter;
 
+public:
   // The inverse predicate should have the negative value.
   enum BranchPredicate {
     INVALID_BR = 0,
@@ -98,6 +99,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
     EXECZ = 3
   };
 
+private:
   using SetVectorType = SmallSetVector<MachineInstr *, 32>;
 
   static unsigned getBranchOpcode(BranchPredicate Cond);
@@ -1031,13 +1033,21 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   /// Return true if the instruction modifies the mode register.
   static bool modifiesModeRegister(const MachineInstr &MI);
 
+  /// Returns true if the execz branch \p Branch over the blocks from \p From
+  /// to \p To must be retained, i.e. it is not profitable to remove it.
+  bool mustRetainExeczBranch(const MachineInstr &Branch,
+                             const MachineBasicBlock &From,
+                             const MachineBasicBlock &To,
+                             unsigned ExtraTransformationCosts = 0) const;
+
   /// This function is used to determine if an instruction can be safely
   /// executed under EXEC = 0 without hardware error, indeterminate results,
   /// and/or visible effects on future vector execution or outside the shader.
-  /// Note: as of 2024 the only use of this is SIPreEmitPeephole where it is
-  /// used in removing branches over short EXEC = 0 sequences.
-  /// As such it embeds certain assumptions which may not apply to every case
-  /// of EXEC = 0 execution.
+  /// Note: as of 2024 the only uses of this are SIPreEmitPeephole and
+  /// AMDGPUDemoteSCCBranchToExecz (through SIInstrInfo::mustRetainExeczBranch),
+  /// where it is used in removing branches over short EXEC = 0 sequences. As
+  /// such it embeds certain assumptions which may not apply to every case of
+  /// EXEC = 0 execution.
   bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const;
 
   /// Returns true if the instruction could potentially depend on the value of
