
Commit dc24dfa

[AMDGPU][AMDGPUDemoteSCCBranchToExecz] Implementation: demote s_cbranch_scc branches into vcmp + s_cbranch_execz branches
Change-Id: I287f101fbb9ee34eac3b048cf5e5f3269707dd52
1 parent 73bb836 commit dc24dfa

File tree: 8 files changed, +429 -121 lines changed
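
For orientation, the rewrite this commit implements has roughly the following shape on a wave64 target. This is an illustrative sketch rather than output from the commit: the compare, registers, literal, and labels are invented, and in real output TII.legalizeOperands may insert extra copies to satisfy VALU operand constraints.

Before: a uniform compare feeding an SCC branch around a small then-block.

  s_cmp_lt_i32 s0, 42
  s_cbranch_scc0 .LBB0_2            ; skip %then when s0 >= 42
; %then
  ...
.LBB0_2:                            ; %tail

After demotion: the compare moves to the VALU, exec is saved and masked, the branch is keyed on execz, and exec is restored at the start of the tail block.

  v_cmp_lt_i32_e64 vcc, s0, 42
  s_and_saveexec_b64 s[2:3], vcc    ; back up exec, then AND it with vcc
  s_cbranch_execz .LBB0_2           ; skip %then when no lane remains active
; %then
  ...
.LBB0_2:                            ; %tail
  s_mov_b64 exec, s[2:3]            ; restore the saved exec mask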

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 0 additions & 8 deletions

@@ -376,14 +376,6 @@ extern char &AMDGPURemoveIncompatibleFunctionsID;
 void initializeAMDGPUDemoteSCCBranchToExeczLegacyPass(PassRegistry &);
 extern char &AMDGPUDemoteSCCBranchToExeczLegacyID;
 
-class AMDGPUDemoteSCCBranchToExeczPass
-    : public PassInfoMixin<AMDGPUDemoteSCCBranchToExeczPass> {
-public:
-  AMDGPUDemoteSCCBranchToExeczPass() = default;
-  PreservedAnalyses run(MachineFunction &MF,
-                        MachineFunctionAnalysisManager &MFAM);
-};
-
 void initializeAMDGPULateCodeGenPrepareLegacyPass(PassRegistry &);
 extern char &AMDGPULateCodeGenPrepareLegacyID;
 
llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.cpp

Lines changed: 259 additions & 5 deletions

@@ -1,18 +1,272 @@
 #include <llvm/CodeGen/MachineFunctionPass.h>
 
 #include "AMDGPU.h"
+#include "AMDGPUDemoteSCCBranchToExecz.h"
+#include "GCNSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
 
 using namespace llvm;
 
 namespace {
 #define DEBUG_TYPE "amdgpu-demote-scc-to-execz"
-const char PassName[] = "AMDGPU if conversion";
+const char PassName[] = "AMDGPU s_cbranch_scc to s_cbranch_execz conversion";
+
+std::optional<unsigned> getVALUOpc(const MachineInstr &MI,
+                                   bool Reverse = false) {
+  unsigned Opc = MI.getOpcode();
+  if (Reverse) {
+    switch (Opc) {
+    case AMDGPU::S_CMP_EQ_I32:
+      Opc = AMDGPU::S_CMP_LG_I32;
+      break;
+    case AMDGPU::S_CMP_LG_I32:
+      Opc = AMDGPU::S_CMP_EQ_I32;
+      break;
+    case AMDGPU::S_CMP_GT_I32:
+      Opc = AMDGPU::S_CMP_LE_I32;
+      break;
+    case AMDGPU::S_CMP_GE_I32:
+      Opc = AMDGPU::S_CMP_LT_I32;
+      break;
+    case AMDGPU::S_CMP_LT_I32:
+      Opc = AMDGPU::S_CMP_GE_I32;
+      break;
+    case AMDGPU::S_CMP_LE_I32:
+      Opc = AMDGPU::S_CMP_GT_I32;
+      break;
+    case AMDGPU::S_CMP_EQ_U32:
+      Opc = AMDGPU::S_CMP_LG_U32;
+      break;
+    case AMDGPU::S_CMP_LG_U32:
+      Opc = AMDGPU::S_CMP_EQ_U32;
+      break;
+    case AMDGPU::S_CMP_GT_U32:
+      Opc = AMDGPU::S_CMP_LE_U32;
+      break;
+    case AMDGPU::S_CMP_GE_U32:
+      Opc = AMDGPU::S_CMP_LT_U32;
+      break;
+    case AMDGPU::S_CMP_LT_U32:
+      Opc = AMDGPU::S_CMP_GE_U32;
+      break;
+    case AMDGPU::S_CMP_LE_U32:
+      Opc = AMDGPU::S_CMP_GT_U32;
+      break;
+    case AMDGPU::S_CMP_EQ_U64:
+      Opc = AMDGPU::S_CMP_LG_U64;
+      break;
+    case AMDGPU::S_CMP_LG_U64:
+      Opc = AMDGPU::S_CMP_EQ_U64;
+      break;
+    default:
+      return std::nullopt;
+    }
+  }
+
+  switch (Opc) {
+  case AMDGPU::S_CMP_EQ_I32:
+    return AMDGPU::V_CMP_EQ_I32_e64;
+  case AMDGPU::S_CMP_LG_I32:
+    return AMDGPU::V_CMP_LT_I32_e64;
+  case AMDGPU::S_CMP_GT_I32:
+    return AMDGPU::V_CMP_GT_I32_e64;
+  case AMDGPU::S_CMP_GE_I32:
+    return AMDGPU::V_CMP_GE_I32_e64;
+  case AMDGPU::S_CMP_LT_I32:
+    return AMDGPU::V_CMP_LT_I32_e64;
+  case AMDGPU::S_CMP_LE_I32:
+    return AMDGPU::V_CMP_LE_I32_e64;
+  case AMDGPU::S_CMP_EQ_U32:
+    return AMDGPU::V_CMP_EQ_U32_e64;
+  case AMDGPU::S_CMP_LG_U32:
+    return AMDGPU::V_CMP_NE_U32_e64;
+  case AMDGPU::S_CMP_GT_U32:
+    return AMDGPU::V_CMP_GT_U32_e64;
+  case AMDGPU::S_CMP_GE_U32:
+    return AMDGPU::V_CMP_GE_U32_e64;
+  case AMDGPU::S_CMP_LT_U32:
+    return AMDGPU::V_CMP_LT_U32_e64;
+  case AMDGPU::S_CMP_LE_U32:
+    return AMDGPU::V_CMP_LE_U32_e64;
+  case AMDGPU::S_CMP_EQ_U64:
+    return AMDGPU::V_CMP_EQ_U64_e64;
+  case AMDGPU::S_CMP_LG_U64:
+    return AMDGPU::V_CMP_NE_U64_e64;
+  default:
+    return std::nullopt;
+  }
+}
+
+bool isSCmpPromotableToVCmp(const MachineInstr &MI) {
+  return getVALUOpc(MI).has_value();
+}
+
+bool isTriangular(MachineBasicBlock &Head, MachineBasicBlock *&Then,
+                  MachineBasicBlock *&Tail) {
+  if (Head.succ_size() != 2)
+    return false;
+
+  Then = Head.succ_begin()[0];
+  Tail = Head.succ_begin()[1];
+
+  // Canonicalize so Succ0 has MBB as its single predecessor.
+  if (Then->pred_size() != 1)
+    std::swap(Then, Tail);
+
+  if (Then->pred_size() != 1 || Then->succ_size() != 1)
+    return false;
+
+  return *Then->succ_begin() == Tail;
+}
+
+bool hasPromotableCmpConditon(MachineInstr &Term, MachineInstr *&Cmp) {
+  auto CmpIt = std::next(Term.getReverseIterator());
+  if (CmpIt == Term.getParent()->instr_rend())
+    return false;
+
+  if (!isSCmpPromotableToVCmp(*CmpIt))
+    return false;
+
+  Cmp = &*CmpIt;
+  return true;
+}
+
+bool hasCbranchSCCTerm(MachineBasicBlock &Head, MachineInstr *&Term) {
+  auto TermIt = Head.getFirstInstrTerminator();
+  if (TermIt == Head.end())
+    return false;
+
+  switch (TermIt->getOpcode()) {
+  case AMDGPU::S_CBRANCH_SCC0:
+  case AMDGPU::S_CBRANCH_SCC1:
+    Term = &*TermIt;
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool isTriangularSCCBranch(MachineBasicBlock &Head, MachineInstr *&Term,
+                           MachineInstr *&Cmp, MachineBasicBlock *&Then,
+                           MachineBasicBlock *&Tail) {
+
+  if (!hasCbranchSCCTerm(Head, Term))
+    return false;
+
+  if (!isTriangular(Head, Then, Tail))
+    return false;
+
+  // phi-nodes in the tail can prevent splicing the instructions of the then
+  // and tail blocks in the head
+  if (!Tail->empty() && Tail->begin()->isPHI())
+    return false;
+
+  if (!hasPromotableCmpConditon(*Term, Cmp))
+    return false;
+
+  return true;
+}
+
+bool SCC1JumpsToThen(const MachineInstr &Term, const MachineBasicBlock &Then) {
+  MachineBasicBlock *TBB = Term.getOperand(0).getMBB();
+  return (TBB == &Then) == (Term.getOpcode() == AMDGPU::S_CBRANCH_SCC1);
+}
 
 class AMDGPUDemoteSCCBranchToExecz {
+  MachineFunction &MF;
+  const GCNSubtarget &ST;
+  const SIInstrInfo &TII;
+  const SIRegisterInfo &RegInfo;
+  const TargetSchedModel &SchedModel;
+
 public:
-  AMDGPUDemoteSCCBranchToExecz() = default;
+  AMDGPUDemoteSCCBranchToExecz(MachineFunction &MF)
+      : MF(MF), ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
+        RegInfo(*ST.getRegisterInfo()), SchedModel(TII.getSchedModel()) {}
+
+  bool mustRetainSCCBranch(const MachineInstr &Term, const MachineInstr &Cmp,
+                           const MachineBasicBlock &Then,
+                           const MachineBasicBlock &Tail) {
+    bool IsWave32 = TII.isWave32();
+    unsigned AndSaveExecOpc =
+        IsWave32 ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
+    unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    unsigned NewOps[] = {*getVALUOpc(Cmp, !SCC1JumpsToThen(Term, Then)),
+                         AndSaveExecOpc, Mov};
+    unsigned NewOpsCost = 0;
+    for (unsigned Opc : NewOps)
+      NewOpsCost += SchedModel.computeInstrLatency(Opc);
+    unsigned OldCmpCost = SchedModel.computeInstrLatency(&Cmp, false);
+
+    assert(NewOpsCost >= OldCmpCost);
+    return !TII.mustRetainExeczBranch(Term, Then, Tail,
+                                      NewOpsCost - OldCmpCost);
+  }
+
+  void demoteCmp(MachineInstr &Term, MachineInstr &Cmp, MachineBasicBlock &Head,
+                 MachineBasicBlock &Then, MachineBasicBlock &Tail) {
+    unsigned NewCmpOpc = *getVALUOpc(Cmp, !SCC1JumpsToThen(Term, Then));
+    Cmp.setDesc(TII.get(NewCmpOpc));
+
+    MachineOperand L = Cmp.getOperand(0);
+    MachineOperand R = Cmp.getOperand(1);
+    for (unsigned i = 3; i != 0; --i)
+      Cmp.removeOperand(i - 1);
 
-  bool run() { return false; }
+    auto VCC = RegInfo.getVCC();
+    auto Exec = RegInfo.getExec();
+
+    auto &MRI = MF.getRegInfo();
+    MCRegister ExecBackup =
+        MRI.createVirtualRegister(RegInfo.getPhysRegBaseClass(Exec));
+
+    Cmp.addOperand(MachineOperand::CreateReg(VCC, true));
+    Cmp.addOperand(L);
+    Cmp.addOperand(R);
+    Cmp.addImplicitDefUseOperands(MF);
+
+    TII.legalizeOperands(Cmp);
+
+    bool IsWave32 = TII.isWave32();
+    unsigned AndSaveExecOpc =
+        IsWave32 ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
+    auto SaveAndMaskExec = BuildMI(*Term.getParent(), Term, Cmp.getDebugLoc(),
+                                   TII.get(AndSaveExecOpc), ExecBackup);
+    SaveAndMaskExec.addReg(VCC, RegState::Kill);
+    SaveAndMaskExec->getOperand(3).setIsDead(); // mark SCC as dead
+
+    DebugLoc DL = Term.getDebugLoc();
+    TII.removeBranch(Head);
+    MachineOperand Cond[] = {
+        MachineOperand::CreateImm(SIInstrInfo::BranchPredicate::EXECZ),
+        MachineOperand::CreateReg(RegInfo.getExec(), false)};
+    TII.insertBranch(Head, &Tail, &Then, Cond, DL);
+
+    TII.restoreExec(MF, Tail, Tail.instr_begin(), DebugLoc(), ExecBackup);
+  }
+
+  bool run() {
+    if (!SchedModel.hasInstrSchedModel())
+      return false;
+    bool Changed = false;
+
+    for (MachineBasicBlock &Head : MF) {
+      MachineInstr *Term;
+      MachineInstr *Cmp;
+      MachineBasicBlock *Then;
+      MachineBasicBlock *Tail;
+      if (!isTriangularSCCBranch(Head, Term, Cmp, Then, Tail))
+        continue;
+
+      if (!mustRetainSCCBranch(*Term, *Cmp, *Then, *Tail))
+        continue;
+
+      demoteCmp(*Term, *Cmp, Head, *Then, *Tail);
+      Changed = true;
+    }
+    return Changed;
+  }
 };
 
 class AMDGPUDemoteSCCBranchToExeczLegacy : public MachineFunctionPass {
@@ -22,7 +276,7 @@ class AMDGPUDemoteSCCBranchToExeczLegacy : public MachineFunctionPass {
   AMDGPUDemoteSCCBranchToExeczLegacy() : MachineFunctionPass(ID) {}
 
   bool runOnMachineFunction(MachineFunction &MF) override {
-    AMDGPUDemoteSCCBranchToExecz IfCvt{};
+    AMDGPUDemoteSCCBranchToExecz IfCvt{MF};
     return IfCvt.run();
   }
 
@@ -39,7 +293,7 @@ char AMDGPUDemoteSCCBranchToExeczLegacy::ID = 0;
 
 PreservedAnalyses llvm::AMDGPUDemoteSCCBranchToExeczPass::run(
    MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) {
-  AMDGPUDemoteSCCBranchToExecz IfCvt{};
+  AMDGPUDemoteSCCBranchToExecz IfCvt{MF};
  if (!IfCvt.run())
    return PreservedAnalyses::all();
  return PreservedAnalyses::none();

llvm/lib/Target/AMDGPU/AMDGPUDemoteSCCBranchToExecz.h

Lines changed: 31 additions & 0 deletions

@@ -0,0 +1,31 @@
+//===- AMDGPURDemoteSCCBranchToExecz.h --- demote s_cbranch_scc -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Pass used to demote s_cbranch_scc0/1 branches to s_cbranch_execz
+/// branches. These can be later removed by SIPreEmitPeephole.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUDEMOTESCCBRANCHTOEXECZ_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUDEMOTESCCBRANCHTOEXECZ_H
+
+#include <llvm/CodeGen/MachineFunction.h>
+#include <llvm/IR/PassManager.h>
+
+namespace llvm {
+class AMDGPUDemoteSCCBranchToExeczPass
+    : public PassInfoMixin<AMDGPUDemoteSCCBranchToExeczPass> {
+public:
+  AMDGPUDemoteSCCBranchToExeczPass() = default;
+  PreservedAnalyses run(MachineFunction &MF,
+                        MachineFunctionAnalysisManager &MFAM);
+};
+} // namespace llvm
+
+#endif
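
The \brief above points at the payoff: once the branch tests execz instead of scc, SIPreEmitPeephole can drop the branch entirely when it judges the then-block cheap and safe to run with all lanes disabled, because the preceding v_cmp and s_and_saveexec already force exec to zero in the not-taken case. Continuing the illustrative wave64 sketch from above (again invented, not output from this commit):

  v_cmp_lt_i32_e64 vcc, s0, 42
  s_and_saveexec_b64 s[2:3], vcc    ; exec becomes zero when the condition fails
; %then falls through; with exec == 0 its exec-masked instructions do nothing
  v_mov_b32_e32 v2, 1
  global_store_dword v[0:1], v2, off
; %tail
  s_mov_b64 exec, s[2:3]            ; restore the saved exec mask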

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 1 addition & 0 deletions

@@ -18,6 +18,7 @@
 #include "AMDGPU.h"
 #include "AMDGPUAliasAnalysis.h"
 #include "AMDGPUCtorDtorLowering.h"
+#include "AMDGPUDemoteSCCBranchToExecz.h"
 #include "AMDGPUExportClustering.h"
 #include "AMDGPUIGroupLP.h"
 #include "AMDGPUISelDAGToDAG.h"
