2
2
3
3
#include " AMDGPU.h"
4
4
#include " AMDGPUDemoteSCCBranchToExecz.h"
5
+ #include " GCNSubtarget.h"
6
+ #include " SIInstrInfo.h"
7
+ #include " SIRegisterInfo.h"
5
8
6
9
using namespace llvm ;
7
10
8
11
namespace {
9
12
#define DEBUG_TYPE "amdgpu-demote-scc-to-execz"

// Human-readable pass name used for pass registration / -debug output.
const char PassName[] = "AMDGPU s_cbranch_scc to s_cbranch_execz conversion";
14
+
15
+ std::optional<unsigned > getVALUOpc (const MachineInstr &MI,
16
+ bool Reverse = false ) {
17
+ unsigned Opc = MI.getOpcode ();
18
+ switch (Opc) {
19
+ #define HandleOpcAndReverse (Opc, ReverseOpc, VOpc, ReverseVOpc ) \
20
+ case Opc: \
21
+ return Reverse ? ReverseVOpc : VOpc; \
22
+ case ReverseOpc: \
23
+ return Reverse ? VOpc : ReverseVOpc
24
+ HandleOpcAndReverse (AMDGPU::S_CMP_EQ_I32, AMDGPU::S_CMP_LG_I32,
25
+ AMDGPU::V_CMP_EQ_I32_e64, AMDGPU::V_CMP_NE_I32_e64);
26
+ HandleOpcAndReverse (AMDGPU::S_CMP_EQ_U32, AMDGPU::S_CMP_LG_U32,
27
+ AMDGPU::V_CMP_EQ_U32_e64, AMDGPU::V_CMP_NE_U32_e64);
28
+ HandleOpcAndReverse (AMDGPU::S_CMP_GT_I32, AMDGPU::S_CMP_LE_I32,
29
+ AMDGPU::V_CMP_GT_I32_e64, AMDGPU::V_CMP_LE_I32_e64);
30
+ HandleOpcAndReverse (AMDGPU::S_CMP_GT_U32, AMDGPU::S_CMP_LE_U32,
31
+ AMDGPU::V_CMP_GT_U32_e64, AMDGPU::V_CMP_LE_U32_e64);
32
+ HandleOpcAndReverse (AMDGPU::S_CMP_GE_I32, AMDGPU::S_CMP_LT_I32,
33
+ AMDGPU::V_CMP_GE_I32_e64, AMDGPU::V_CMP_LT_I32_e64);
34
+ HandleOpcAndReverse (AMDGPU::S_CMP_GE_U32, AMDGPU::S_CMP_LT_U32,
35
+ AMDGPU::V_CMP_GE_U32_e64, AMDGPU::V_CMP_LT_U32_e64);
36
+ HandleOpcAndReverse (AMDGPU::S_CMP_EQ_U64, AMDGPU::S_CMP_LG_U64,
37
+ AMDGPU::V_CMP_EQ_U64_e64, AMDGPU::V_CMP_NE_U64_e64);
38
+ #undef HandleOpcAndReverse
39
+ default :
40
+ break ;
41
+ }
42
+ return std::nullopt;
43
+ }
44
+
45
+ bool isSCmpPromotableToVCmp (const MachineInstr &MI) {
46
+ return getVALUOpc (MI).has_value ();
47
+ }
48
+
49
+ bool isTriangular (MachineBasicBlock &Head, MachineBasicBlock *&Then,
50
+ MachineBasicBlock *&Tail) {
51
+ if (Head.succ_size () != 2 )
52
+ return false ;
53
+
54
+ Then = Head.succ_begin ()[0 ];
55
+ Tail = Head.succ_begin ()[1 ];
56
+
57
+ // Canonicalize so Succ0 has MBB as its single predecessor.
58
+ if (Then->pred_size () != 1 )
59
+ std::swap (Then, Tail);
60
+
61
+ if (Then->pred_size () != 1 || Then->succ_size () != 1 )
62
+ return false ;
63
+
64
+ return *Then->succ_begin () == Tail;
65
+ }
66
+
67
+ bool hasPromotableCmpConditon (MachineInstr &Term, MachineInstr *&Cmp) {
68
+ auto CmpIt = std::next (Term.getReverseIterator ());
69
+ if (CmpIt == Term.getParent ()->instr_rend ())
70
+ return false ;
71
+
72
+ if (!isSCmpPromotableToVCmp (*CmpIt))
73
+ return false ;
74
+
75
+ Cmp = &*CmpIt;
76
+ return true ;
77
+ }
78
+
79
+ bool hasCbranchSCCTerm (MachineBasicBlock &Head, MachineInstr *&Term) {
80
+ auto TermIt = Head.getFirstInstrTerminator ();
81
+ if (TermIt == Head.end ())
82
+ return false ;
83
+
84
+ switch (TermIt->getOpcode ()) {
85
+ case AMDGPU::S_CBRANCH_SCC0:
86
+ case AMDGPU::S_CBRANCH_SCC1:
87
+ Term = &*TermIt;
88
+ return true ;
89
+ default :
90
+ return false ;
91
+ }
92
+ }
93
+
94
+ bool isTriangularSCCBranch (MachineBasicBlock &Head, MachineInstr *&Term,
95
+ MachineInstr *&Cmp, MachineBasicBlock *&Then,
96
+ MachineBasicBlock *&Tail) {
97
+
98
+ if (!hasCbranchSCCTerm (Head, Term))
99
+ return false ;
100
+
101
+ bool SCCIsUsedOutsideHead = any_of (
102
+ Head.liveouts (), [](const auto &P) { return P.PhysReg == AMDGPU::SCC; });
103
+ if (SCCIsUsedOutsideHead)
104
+ return false ;
105
+
106
+ if (!isTriangular (Head, Then, Tail))
107
+ return false ;
108
+
109
+ // phi-nodes in the tail can prevent splicing the instructions of the then
110
+ // and tail blocks in the head
111
+ if (!Tail->empty () && Tail->begin ()->isPHI ())
112
+ return false ;
113
+
114
+ if (!hasPromotableCmpConditon (*Term, Cmp))
115
+ return false ;
116
+
117
+ return true ;
118
+ }
119
+
120
+ bool SCC1JumpsToThen (const MachineInstr &Term, const MachineBasicBlock &Then) {
121
+ MachineBasicBlock *TBB = Term.getOperand (0 ).getMBB ();
122
+ return (TBB == &Then) == (Term.getOpcode () == AMDGPU::S_CBRANCH_SCC1);
123
+ }
11
124
12
125
class AMDGPUDemoteSCCBranchToExecz {
126
+ MachineFunction &MF;
127
+ const GCNSubtarget &ST;
128
+ const SIInstrInfo &TII;
129
+ const SIRegisterInfo &RegInfo;
130
+ const TargetSchedModel &SchedModel;
131
+
13
132
public:
14
- AMDGPUDemoteSCCBranchToExecz () = default ;
133
+ AMDGPUDemoteSCCBranchToExecz (MachineFunction &MF)
134
+ : MF(MF), ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
135
+ RegInfo (*ST.getRegisterInfo()), SchedModel(TII.getSchedModel()) {}
136
+
137
+ bool mustRetainSCCBranch (const MachineInstr &Term, const MachineInstr &Cmp,
138
+ const MachineBasicBlock &Then,
139
+ const MachineBasicBlock &Tail) {
140
+ bool IsWave32 = TII.isWave32 ();
141
+ unsigned AndSaveExecOpc =
142
+ IsWave32 ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
143
+ unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
144
+ unsigned NewOps[] = {*getVALUOpc (Cmp, !SCC1JumpsToThen (Term, Then)),
145
+ AndSaveExecOpc, Mov};
146
+ unsigned NewOpsCost = 0 ;
147
+ for (unsigned Opc : NewOps)
148
+ NewOpsCost += SchedModel.computeInstrLatency (Opc);
149
+ unsigned OldCmpCost = SchedModel.computeInstrLatency (&Cmp, false );
150
+
151
+ assert (NewOpsCost >= OldCmpCost);
152
+ return !TII.mustRetainExeczBranch (Term, Then, Tail,
153
+ NewOpsCost - OldCmpCost);
154
+ }
155
+
156
+ void demoteCmp (MachineInstr &Term, MachineInstr &Cmp, MachineBasicBlock &Head,
157
+ MachineBasicBlock &Then, MachineBasicBlock &Tail) {
158
+ unsigned NewCmpOpc = *getVALUOpc (Cmp, !SCC1JumpsToThen (Term, Then));
159
+ Cmp.setDesc (TII.get (NewCmpOpc));
160
+
161
+ Cmp.removeOperand (2 );
162
+
163
+ auto VCC = RegInfo.getVCC ();
164
+ auto Exec = RegInfo.getExec ();
15
165
16
- bool run () { return false ; }
166
+ auto &MRI = MF.getRegInfo ();
167
+ MCRegister ExecBackup =
168
+ MRI.createVirtualRegister (RegInfo.getPhysRegBaseClass (Exec));
169
+
170
+ Cmp.insert (Cmp.operands_begin (), MachineOperand::CreateReg (VCC, true ));
171
+ Cmp.addImplicitDefUseOperands (MF);
172
+
173
+ TII.legalizeOperands (Cmp);
174
+
175
+ bool IsWave32 = TII.isWave32 ();
176
+ unsigned AndSaveExecOpc =
177
+ IsWave32 ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
178
+ auto SaveAndMaskExec = BuildMI (*Term.getParent (), Term, Cmp.getDebugLoc (),
179
+ TII.get (AndSaveExecOpc), ExecBackup);
180
+ SaveAndMaskExec.addReg (VCC, RegState::Kill);
181
+ SaveAndMaskExec->getOperand (3 ).setIsDead (); // mark SCC as dead
182
+
183
+ DebugLoc DL = Term.getDebugLoc ();
184
+ TII.removeBranch (Head);
185
+ MachineOperand Cond[] = {
186
+ MachineOperand::CreateImm (SIInstrInfo::BranchPredicate::EXECZ),
187
+ MachineOperand::CreateReg (RegInfo.getExec (), false )};
188
+ TII.insertBranch (Head, &Tail, &Then, Cond, DL);
189
+
190
+ TII.restoreExec (MF, Tail, Tail.instr_begin (), DebugLoc (), ExecBackup);
191
+ }
192
+
193
+ bool run () {
194
+ if (!SchedModel.hasInstrSchedModel ())
195
+ return false ;
196
+ bool Changed = false ;
197
+
198
+ for (MachineBasicBlock &Head : MF) {
199
+ MachineInstr *Term;
200
+ MachineInstr *Cmp;
201
+ MachineBasicBlock *Then;
202
+ MachineBasicBlock *Tail;
203
+ if (!isTriangularSCCBranch (Head, Term, Cmp, Then, Tail))
204
+ continue ;
205
+
206
+ if (!mustRetainSCCBranch (*Term, *Cmp, *Then, *Tail))
207
+ continue ;
208
+
209
+ demoteCmp (*Term, *Cmp, Head, *Then, *Tail);
210
+ Changed = true ;
211
+ }
212
+ return Changed;
213
+ }
17
214
};
18
215
19
216
class AMDGPUDemoteSCCBranchToExeczLegacy : public MachineFunctionPass {
@@ -23,7 +220,7 @@ class AMDGPUDemoteSCCBranchToExeczLegacy : public MachineFunctionPass {
23
220
AMDGPUDemoteSCCBranchToExeczLegacy () : MachineFunctionPass(ID) {}
24
221
25
222
bool runOnMachineFunction (MachineFunction &MF) override {
26
- AMDGPUDemoteSCCBranchToExecz IfCvt{};
223
+ AMDGPUDemoteSCCBranchToExecz IfCvt{MF };
27
224
return IfCvt.run ();
28
225
}
29
226
@@ -40,7 +237,7 @@ char AMDGPUDemoteSCCBranchToExeczLegacy::ID = 0;
40
237
41
238
PreservedAnalyses llvm::AMDGPUDemoteSCCBranchToExeczPass::run (
42
239
MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) {
43
- AMDGPUDemoteSCCBranchToExecz IfCvt{};
240
+ AMDGPUDemoteSCCBranchToExecz IfCvt{MF };
44
241
if (!IfCvt.run ())
45
242
return PreservedAnalyses::all ();
46
243
return PreservedAnalyses::none ();
0 commit comments