Commit 6f1d22d (1 parent: c43d1d6)

[AMDGPU][AMDGPUDemoteSCCBranchToExecz] Implementation: demote s_cbranch_scc branches into vcmp + s_cbranch_execz branches
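For context, a minimal before/after sketch of the rewrite this pass performs, on a hypothetical wave64 example (the registers, labels, and the choice of s_cmp_lg_u32 are illustrative, not taken from the patch):

    ; before: uniform compare + SCC branch around a short 'then' block
        s_cmp_lg_u32   s0, 0
        s_cbranch_scc0 .Ltail
        ...                              ; 'then' block
    .Ltail:

    ; after: VALU compare + exec masking, branch demoted to execz
        v_cmp_ne_u32_e64   vcc, s0, 0
        s_and_saveexec_b64 s[2:3], vcc   ; back up EXEC, then EXEC &= VCC
        s_cbranch_execz    .Ltail
        ...                              ; 'then' block, now predicated by EXEC
    .Ltail:
        s_mov_b64 exec, s[2:3]           ; restore EXEC

The payoff is that the existing execz-branch cost model (mustRetainExeczBranch, extended in SIInstrInfo.cpp below) may then remove the s_cbranch_execz entirely and let the wave fall through the predicated block, which is never safe for an SCC branch.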

File tree

5 files changed: +403 -120 lines changed

llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.cpp

Lines changed: 258 additions & 5 deletions
@@ -1,18 +1,271 @@
 #include <llvm/CodeGen/MachineFunctionPass.h>
 
 #include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
 
 using namespace llvm;
 
 namespace {
 #define DEBUG_TYPE "amdgpu-demote-scc-to-execz"
-const char PassName[] = "AMDGPU if conversion";
+const char PassName[] = "AMDGPU s_cbranch_scc to s_cbranch_execz conversion";
+
+std::optional<unsigned> getVALUOpc(const MachineInstr &MI,
+                                   bool Reverse = false) {
+  unsigned Opc = MI.getOpcode();
+  if (Reverse) {
+    switch (Opc) {
+    case AMDGPU::S_CMP_EQ_I32:
+      Opc = AMDGPU::S_CMP_LG_I32;
+      break;
+    case AMDGPU::S_CMP_LG_I32:
+      Opc = AMDGPU::S_CMP_EQ_I32;
+      break;
+    case AMDGPU::S_CMP_GT_I32:
+      Opc = AMDGPU::S_CMP_LE_I32;
+      break;
+    case AMDGPU::S_CMP_GE_I32:
+      Opc = AMDGPU::S_CMP_LT_I32;
+      break;
+    case AMDGPU::S_CMP_LT_I32:
+      Opc = AMDGPU::S_CMP_GE_I32;
+      break;
+    case AMDGPU::S_CMP_LE_I32:
+      Opc = AMDGPU::S_CMP_GT_I32;
+      break;
+    case AMDGPU::S_CMP_EQ_U32:
+      Opc = AMDGPU::S_CMP_LG_U32;
+      break;
+    case AMDGPU::S_CMP_LG_U32:
+      Opc = AMDGPU::S_CMP_EQ_U32;
+      break;
+    case AMDGPU::S_CMP_GT_U32:
+      Opc = AMDGPU::S_CMP_LE_U32;
+      break;
+    case AMDGPU::S_CMP_GE_U32:
+      Opc = AMDGPU::S_CMP_LT_U32;
+      break;
+    case AMDGPU::S_CMP_LT_U32:
+      Opc = AMDGPU::S_CMP_GE_U32;
+      break;
+    case AMDGPU::S_CMP_LE_U32:
+      Opc = AMDGPU::S_CMP_GT_U32;
+      break;
+    case AMDGPU::S_CMP_EQ_U64:
+      Opc = AMDGPU::S_CMP_LG_U64;
+      break;
+    case AMDGPU::S_CMP_LG_U64:
+      Opc = AMDGPU::S_CMP_EQ_U64;
+      break;
+    default:
+      return std::nullopt;
+    }
+  }
+
+  switch (Opc) {
+  case AMDGPU::S_CMP_EQ_I32:
+    return AMDGPU::V_CMP_EQ_I32_e64;
+  case AMDGPU::S_CMP_LG_I32:
+    return AMDGPU::V_CMP_NE_I32_e64;
+  case AMDGPU::S_CMP_GT_I32:
+    return AMDGPU::V_CMP_GT_I32_e64;
+  case AMDGPU::S_CMP_GE_I32:
+    return AMDGPU::V_CMP_GE_I32_e64;
+  case AMDGPU::S_CMP_LT_I32:
+    return AMDGPU::V_CMP_LT_I32_e64;
+  case AMDGPU::S_CMP_LE_I32:
+    return AMDGPU::V_CMP_LE_I32_e64;
+  case AMDGPU::S_CMP_EQ_U32:
+    return AMDGPU::V_CMP_EQ_U32_e64;
+  case AMDGPU::S_CMP_LG_U32:
+    return AMDGPU::V_CMP_NE_U32_e64;
+  case AMDGPU::S_CMP_GT_U32:
+    return AMDGPU::V_CMP_GT_U32_e64;
+  case AMDGPU::S_CMP_GE_U32:
+    return AMDGPU::V_CMP_GE_U32_e64;
+  case AMDGPU::S_CMP_LT_U32:
+    return AMDGPU::V_CMP_LT_U32_e64;
+  case AMDGPU::S_CMP_LE_U32:
+    return AMDGPU::V_CMP_LE_U32_e64;
+  case AMDGPU::S_CMP_EQ_U64:
+    return AMDGPU::V_CMP_EQ_U64_e64;
+  case AMDGPU::S_CMP_LG_U64:
+    return AMDGPU::V_CMP_NE_U64_e64;
+  default:
+    return std::nullopt;
+  }
+}
+
+bool isSCmpPromotableToVCmp(const MachineInstr &MI) {
+  return getVALUOpc(MI).has_value();
+}
+
+bool isTriangular(MachineBasicBlock &Head, MachineBasicBlock *&Then,
+                  MachineBasicBlock *&Tail) {
+  if (Head.succ_size() != 2)
+    return false;
+
+  Then = Head.succ_begin()[0];
+  Tail = Head.succ_begin()[1];
+
+  // Canonicalize so that Then has Head as its single predecessor.
+  if (Then->pred_size() != 1)
+    std::swap(Then, Tail);
+
+  if (Then->pred_size() != 1 || Then->succ_size() != 1)
+    return false;
+
+  return *Then->succ_begin() == Tail;
+}
+
+bool hasPromotableCmpCondition(MachineInstr &Term, MachineInstr *&Cmp) {
+  auto CmpIt = std::next(Term.getReverseIterator());
+  if (CmpIt == Term.getParent()->instr_rend())
+    return false;
+
+  if (!isSCmpPromotableToVCmp(*CmpIt))
+    return false;
+
+  Cmp = &*CmpIt;
+  return true;
+}
+
+bool hasCbranchSCCTerm(MachineBasicBlock &Head, MachineInstr *&Term) {
+  auto TermIt = Head.getFirstInstrTerminator();
+  if (TermIt == Head.end())
+    return false;
+
+  switch (TermIt->getOpcode()) {
+  case AMDGPU::S_CBRANCH_SCC0:
+  case AMDGPU::S_CBRANCH_SCC1:
+    Term = &*TermIt;
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool isTriangularSCCBranch(MachineBasicBlock &Head, MachineInstr *&Term,
+                           MachineInstr *&Cmp, MachineBasicBlock *&Then,
+                           MachineBasicBlock *&Tail) {
+
+  if (!hasCbranchSCCTerm(Head, Term))
+    return false;
+
+  if (!isTriangular(Head, Then, Tail))
+    return false;
+
+  // PHI nodes in the tail can prevent splicing the instructions of the then
+  // and tail blocks into the head.
+  if (!Tail->empty() && Tail->begin()->isPHI())
+    return false;
+
+  if (!hasPromotableCmpCondition(*Term, Cmp))
+    return false;
+
+  return true;
+}
+
+bool SCC1JumpsToThen(const MachineInstr &Term, const MachineBasicBlock &Then) {
+  MachineBasicBlock *TBB = Term.getOperand(0).getMBB();
+  return (TBB == &Then) == (Term.getOpcode() == AMDGPU::S_CBRANCH_SCC1);
+}
 
 class AMDGPUDemoteSCCBranchToExecz {
+  MachineFunction &MF;
+  const GCNSubtarget &ST;
+  const SIInstrInfo &TII;
+  const SIRegisterInfo &RegInfo;
+  const TargetSchedModel &SchedModel;
+
 public:
-  AMDGPUDemoteSCCBranchToExecz() = default;
+  AMDGPUDemoteSCCBranchToExecz(MachineFunction &MF)
+      : MF(MF), ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
+        RegInfo(*ST.getRegisterInfo()), SchedModel(TII.getSchedModel()) {}
+
+  bool shouldDemoteSCCBranch(const MachineInstr &Term, const MachineInstr &Cmp,
+                             const MachineBasicBlock &Then,
+                             const MachineBasicBlock &Tail) {
+    bool IsWave32 = TII.isWave32();
+    unsigned AndSaveExecOpc =
+        IsWave32 ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
+    unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    unsigned NewOps[] = {*getVALUOpc(Cmp, !SCC1JumpsToThen(Term, Then)),
+                         AndSaveExecOpc, Mov};
+    unsigned NewOpsCost = 0;
+    for (unsigned Opc : NewOps)
+      NewOpsCost += SchedModel.computeInstrLatency(Opc);
+    unsigned OldCmpCost = SchedModel.computeInstrLatency(&Cmp, false);
+
+    assert(NewOpsCost >= OldCmpCost);
+    return !TII.mustRetainExeczBranch(*Term.getParent(), Then, Tail,
+                                      NewOpsCost - OldCmpCost);
+  }
+
+  void demoteCmp(MachineInstr &Term, MachineInstr &Cmp, MachineBasicBlock &Head,
+                 MachineBasicBlock &Then, MachineBasicBlock &Tail) {
+    unsigned NewCmpOpc = *getVALUOpc(Cmp, !SCC1JumpsToThen(Term, Then));
+    Cmp.setDesc(TII.get(NewCmpOpc));
+
+    MachineOperand L = Cmp.getOperand(0);
+    MachineOperand R = Cmp.getOperand(1);
+    for (unsigned i = 3; i != 0; --i)
+      Cmp.removeOperand(i - 1);
 
-  bool run() { return false; }
+    auto VCC = RegInfo.getVCC();
+    auto Exec = RegInfo.getExec();
+
+    auto &MRI = MF.getRegInfo();
+    Register ExecBackup =
+        MRI.createVirtualRegister(RegInfo.getPhysRegBaseClass(Exec));
+
+    Cmp.addOperand(MachineOperand::CreateReg(VCC, true));
+    Cmp.addOperand(L);
+    Cmp.addOperand(R);
+    Cmp.addImplicitDefUseOperands(MF);
+
+    TII.legalizeOperands(Cmp);
+
+    bool IsWave32 = TII.isWave32();
+    unsigned AndSaveExecOpc =
+        IsWave32 ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
+    auto SaveAndMaskExec = BuildMI(*Term.getParent(), Term, Cmp.getDebugLoc(),
+                                   TII.get(AndSaveExecOpc), ExecBackup);
+    SaveAndMaskExec.addReg(VCC, RegState::Kill);
+    SaveAndMaskExec->getOperand(3).setIsDead(); // Mark SCC as dead.
+
+    DebugLoc DL = Term.getDebugLoc();
+    TII.removeBranch(Head);
+    MachineOperand Cond[] = {
+        MachineOperand::CreateImm(SIInstrInfo::BranchPredicate::EXECZ),
+        MachineOperand::CreateReg(RegInfo.getExec(), false)};
+    TII.insertBranch(Head, &Tail, &Then, Cond, DL);
+
+    TII.restoreExec(MF, Tail, Tail.instr_begin(), DebugLoc(), ExecBackup);
+  }
+
+  bool run() {
+    if (!SchedModel.hasInstrSchedModel())
+      return false;
+    bool Changed = false;
+
+    for (MachineBasicBlock &Head : MF) {
+      MachineInstr *Term;
+      MachineInstr *Cmp;
+      MachineBasicBlock *Then;
+      MachineBasicBlock *Tail;
+      if (!isTriangularSCCBranch(Head, Term, Cmp, Then, Tail))
+        continue;
+
+      if (!shouldDemoteSCCBranch(*Term, *Cmp, *Then, *Tail))
+        continue;
+
+      demoteCmp(*Term, *Cmp, Head, *Then, *Tail);
+      Changed = true;
+    }
+    return Changed;
+  }
 };
 
 class AMDGPUDemoteSCCBranchToExeczLegacy : public MachineFunctionPass {
@@ -22,7 +275,7 @@ class AMDGPUDemoteSCCBranchToExeczLegacy : public MachineFunctionPass {
   AMDGPUDemoteSCCBranchToExeczLegacy() : MachineFunctionPass(ID) {}
 
   bool runOnMachineFunction(MachineFunction &MF) override {
-    AMDGPUDemoteSCCBranchToExecz IfCvt{};
+    AMDGPUDemoteSCCBranchToExecz IfCvt{MF};
     return IfCvt.run();
   }
 
@@ -39,7 +292,7 @@ char AMDGPUDemoteSCCBranchToExeczLegacy::ID = 0;
 
 PreservedAnalyses llvm::AMDGPUDemoteSCCBranchToExeczPass::run(
     MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) {
-  AMDGPUDemoteSCCBranchToExecz IfCvt{};
+  AMDGPUDemoteSCCBranchToExecz IfCvt{MF};
   if (!IfCvt.run())
     return PreservedAnalyses::all();
   return PreservedAnalyses::none();
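For reference, the control-flow shape isTriangular matches is the classic triangle sketched below: Head conditionally branches around Then, which has Head as its only predecessor and Tail as its only successor.

    Head
    |  \
    |   Then
    |  /
    Tail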

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 90 additions & 0 deletions
@@ -4115,6 +4115,96 @@ bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
   return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
 }
 
+namespace {
+class BranchWeightCostModel {
+  const SIInstrInfo &TII;
+  const TargetSchedModel &SchedModel;
+  BranchProbability BranchProb;
+  uint64_t BranchCost;
+  uint64_t ThenCyclesCost;
+
+public:
+  BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch,
+                        const MachineBasicBlock &Succ,
+                        unsigned ExtraTransformationCosts = 0)
+      : TII(TII), SchedModel(TII.getSchedModel()),
+        ThenCyclesCost(ExtraTransformationCosts) {
+    assert(SchedModel.hasInstrSchedModelOrItineraries());
+
+    const MachineBasicBlock &Head = *Branch.getParent();
+    const auto *FromIt = find(Head.successors(), &Succ);
+    assert(FromIt != Head.succ_end());
+
+    BranchProb = Head.getSuccProbability(FromIt);
+    if (BranchProb.isUnknown())
+      return;
+
+    BranchCost = SchedModel.computeInstrLatency(&Branch, false);
+  }
+
+  bool isUnknown() const { return BranchProb.isUnknown(); }
+
+  bool isProfitable(const MachineInstr &MI) {
+    assert(!isUnknown());
+
+    if (TII.isWaitcnt(MI.getOpcode()))
+      return false;
+
+    ThenCyclesCost += SchedModel.computeInstrLatency(&MI, false);
+
+    // Consider `P = N/D` to be the probability of the 'then' block being
+    // executed (i.e. of execnz being true).
+    // The transformation is profitable if always executing the 'then' block
+    // (plus the extra transformation cost) is cheaper than sometimes
+    // executing it and always executing s_cbranch_execnz:
+    // * ThenCost + Extra <= P * (ThenCost + Extra) + BranchCost
+    // * (1-P) * (ThenCost + Extra) <= BranchCost
+    // * (D-N)/D * (ThenCost + Extra) <= BranchCost
+    uint64_t Numerator = BranchProb.getNumerator();
+    uint64_t Denominator = BranchProb.getDenominator();
+    return (Denominator - Numerator) * ThenCyclesCost <=
+           Denominator * BranchCost;
+  }
+};
+} // namespace
+
+bool SIInstrInfo::mustRetainExeczBranch(
+    const MachineBasicBlock &Head, const MachineBasicBlock &From,
+    const MachineBasicBlock &To, unsigned ExtraTransformationCosts) const {
+
+  const auto *FromIt = find(Head.successors(), &From);
+  assert(FromIt != Head.succ_end());
+
+  BranchWeightCostModel CostModel{*this, *Head.getFirstTerminator(), From,
+                                  ExtraTransformationCosts};
+  if (CostModel.isUnknown())
+    return true;
+
+  const MachineFunction *MF = From.getParent();
+  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
+       MBBI != End && MBBI != ToI; ++MBBI) {
+    const MachineBasicBlock &MBB = *MBBI;
+
+    for (const MachineInstr &MI : MBB) {
+      // When a uniform loop is inside non-uniform control flow, the branch
+      // leaving the loop might never be taken when EXEC = 0.
+      // Hence we should retain the cbranch out of the loop lest it become
+      // infinite.
+      if (MI.isConditionalBranch())
+        return true;
+
+      if (MI.isMetaInstruction())
+        continue;
+
+      if (hasUnwantedEffectsWhenEXECEmpty(MI))
+        return true;
+
+      if (!CostModel.isProfitable(MI))
+        return true;
+    }
+  }
+
+  return false;
+}
+
 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
   unsigned Opcode = MI.getOpcode();

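To make the profitability check concrete, a worked instance with assumed numbers (illustrative, not from the patch): suppose branch weights give the 'then' successor probability P = N/D = 7/8, and s_cbranch_execnz costs BranchCost = 4 cycles on the scheduling model in use. The code's test (D-N) * ThenCyclesCost <= D * BranchCost becomes (8-7) * ThenCyclesCost <= 8 * 4, so removing the branch is considered profitable as long as the 'then' block plus the extra v_cmp/s_and_saveexec/s_mov latency totals at most 32 cycles.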