 #include <llvm/CodeGen/MachineFunctionPass.h>
 
 #include "AMDGPU.h"
+#include "AMDGPUDemoteSCCBranchToExecz.h"
+#include "GCNSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
 
 using namespace llvm;
 
 namespace {
 #define DEBUG_TYPE "amdgpu-demote-scc-to-execz"
-const char PassName[] = "AMDGPU if conversion";
+const char PassName[] = "AMDGPU s_cbranch_scc to s_cbranch_execz conversion";
+
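+// Map an S_CMP opcode to its V_CMP_*_e64 equivalent (the VALU compare that
+// writes VCC). With Reverse set, the condition is negated first (e.g.
+// S_CMP_EQ_I32 is treated as S_CMP_LG_I32) so callers can invert the branch
+// sense.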
+std::optional<unsigned> getVALUOpc(const MachineInstr &MI,
+                                   bool Reverse = false) {
+  unsigned Opc = MI.getOpcode();
+  if (Reverse) {
+    switch (Opc) {
+    case AMDGPU::S_CMP_EQ_I32:
+      Opc = AMDGPU::S_CMP_LG_I32;
+      break;
+    case AMDGPU::S_CMP_LG_I32:
+      Opc = AMDGPU::S_CMP_EQ_I32;
+      break;
+    case AMDGPU::S_CMP_GT_I32:
+      Opc = AMDGPU::S_CMP_LE_I32;
+      break;
+    case AMDGPU::S_CMP_GE_I32:
+      Opc = AMDGPU::S_CMP_LT_I32;
+      break;
+    case AMDGPU::S_CMP_LT_I32:
+      Opc = AMDGPU::S_CMP_GE_I32;
+      break;
+    case AMDGPU::S_CMP_LE_I32:
+      Opc = AMDGPU::S_CMP_GT_I32;
+      break;
+    case AMDGPU::S_CMP_EQ_U32:
+      Opc = AMDGPU::S_CMP_LG_U32;
+      break;
+    case AMDGPU::S_CMP_LG_U32:
+      Opc = AMDGPU::S_CMP_EQ_U32;
+      break;
+    case AMDGPU::S_CMP_GT_U32:
+      Opc = AMDGPU::S_CMP_LE_U32;
+      break;
+    case AMDGPU::S_CMP_GE_U32:
+      Opc = AMDGPU::S_CMP_LT_U32;
+      break;
+    case AMDGPU::S_CMP_LT_U32:
+      Opc = AMDGPU::S_CMP_GE_U32;
+      break;
+    case AMDGPU::S_CMP_LE_U32:
+      Opc = AMDGPU::S_CMP_GT_U32;
+      break;
+    case AMDGPU::S_CMP_EQ_U64:
+      Opc = AMDGPU::S_CMP_LG_U64;
+      break;
+    case AMDGPU::S_CMP_LG_U64:
+      Opc = AMDGPU::S_CMP_EQ_U64;
+      break;
+    default:
+      return std::nullopt;
+    }
+  }
+
+  switch (Opc) {
+  case AMDGPU::S_CMP_EQ_I32:
+    return AMDGPU::V_CMP_EQ_I32_e64;
+  case AMDGPU::S_CMP_LG_I32:
+    return AMDGPU::V_CMP_NE_I32_e64;
+  case AMDGPU::S_CMP_GT_I32:
+    return AMDGPU::V_CMP_GT_I32_e64;
+  case AMDGPU::S_CMP_GE_I32:
+    return AMDGPU::V_CMP_GE_I32_e64;
+  case AMDGPU::S_CMP_LT_I32:
+    return AMDGPU::V_CMP_LT_I32_e64;
+  case AMDGPU::S_CMP_LE_I32:
+    return AMDGPU::V_CMP_LE_I32_e64;
+  case AMDGPU::S_CMP_EQ_U32:
+    return AMDGPU::V_CMP_EQ_U32_e64;
+  case AMDGPU::S_CMP_LG_U32:
+    return AMDGPU::V_CMP_NE_U32_e64;
+  case AMDGPU::S_CMP_GT_U32:
+    return AMDGPU::V_CMP_GT_U32_e64;
+  case AMDGPU::S_CMP_GE_U32:
+    return AMDGPU::V_CMP_GE_U32_e64;
+  case AMDGPU::S_CMP_LT_U32:
+    return AMDGPU::V_CMP_LT_U32_e64;
+  case AMDGPU::S_CMP_LE_U32:
+    return AMDGPU::V_CMP_LE_U32_e64;
+  case AMDGPU::S_CMP_EQ_U64:
+    return AMDGPU::V_CMP_EQ_U64_e64;
+  case AMDGPU::S_CMP_LG_U64:
+    return AMDGPU::V_CMP_NE_U64_e64;
+  default:
+    return std::nullopt;
+  }
+}
+
+bool isSCmpPromotableToVCmp(const MachineInstr &MI) {
+  return getVALUOpc(MI).has_value();
+}
+
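+// Match a triangle: Head conditionally branches to Then or Tail, Then's only
+// predecessor is Head, and Then's only successor is Tail.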
+bool isTriangular(MachineBasicBlock &Head, MachineBasicBlock *&Then,
+                  MachineBasicBlock *&Tail) {
+  if (Head.succ_size() != 2)
+    return false;
+
+  Then = Head.succ_begin()[0];
+  Tail = Head.succ_begin()[1];
+
+  // Canonicalize so that Then has Head as its single predecessor.
+  if (Then->pred_size() != 1)
+    std::swap(Then, Tail);
+
+  if (Then->pred_size() != 1 || Then->succ_size() != 1)
+    return false;
+
+  return *Then->succ_begin() == Tail;
+}
+
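+// The compare feeding the branch must be the instruction immediately before
+// the terminator, and it must be an S_CMP we know how to rewrite as a V_CMP.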
+bool hasPromotableCmpCondition(MachineInstr &Term, MachineInstr *&Cmp) {
+  auto CmpIt = std::next(Term.getReverseIterator());
+  if (CmpIt == Term.getParent()->instr_rend())
+    return false;
+
+  if (!isSCmpPromotableToVCmp(*CmpIt))
+    return false;
+
+  Cmp = &*CmpIt;
+  return true;
+}
+
+bool hasCbranchSCCTerm(MachineBasicBlock &Head, MachineInstr *&Term) {
+  auto TermIt = Head.getFirstInstrTerminator();
+  if (TermIt == Head.instr_end())
+    return false;
+
+  switch (TermIt->getOpcode()) {
+  case AMDGPU::S_CBRANCH_SCC0:
+  case AMDGPU::S_CBRANCH_SCC1:
+    Term = &*TermIt;
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool isTriangularSCCBranch(MachineBasicBlock &Head, MachineInstr *&Term,
+                           MachineInstr *&Cmp, MachineBasicBlock *&Then,
+                           MachineBasicBlock *&Tail) {
+  if (!hasCbranchSCCTerm(Head, Term))
+    return false;
+
+  if (!isTriangular(Head, Then, Tail))
+    return false;
+
+  // PHI nodes in the tail would prevent splicing the instructions of the
+  // then and tail blocks into the head block.
+  if (!Tail->empty() && Tail->begin()->isPHI())
+    return false;
+
+  if (!hasPromotableCmpCondition(*Term, Cmp))
+    return false;
+
+  return true;
+}
+
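+// True if control transfers to Then exactly when SCC is 1, i.e. the
+// terminator is an S_CBRANCH_SCC1 targeting Then or an S_CBRANCH_SCC0
+// targeting Tail.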
+bool SCC1JumpsToThen(const MachineInstr &Term, const MachineBasicBlock &Then) {
+  MachineBasicBlock *TBB = Term.getOperand(0).getMBB();
+  return (TBB == &Then) == (Term.getOpcode() == AMDGPU::S_CBRANCH_SCC1);
+}
 
 class AMDGPUDemoteSCCBranchToExecz {
+  MachineFunction &MF;
+  const GCNSubtarget &ST;
+  const SIInstrInfo &TII;
+  const SIRegisterInfo &RegInfo;
+  const TargetSchedModel &SchedModel;
+
 public:
-  AMDGPUDemoteSCCBranchToExecz() = default;
+  AMDGPUDemoteSCCBranchToExecz(MachineFunction &MF)
+      : MF(MF), ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
+        RegInfo(*ST.getRegisterInfo()), SchedModel(TII.getSchedModel()) {}
+
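+  // Estimate the added latency of the instructions the demotion introduces
+  // (the V_CMP, the S_AND_SAVEEXEC, and the S_MOV restoring EXEC) over the
+  // S_CMP they replace, then ask the existing execz-branch heuristic whether
+  // the then-block is still cheap enough to run with EXEC masked.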
+  bool shouldDemoteSCCBranch(const MachineInstr &Term, const MachineInstr &Cmp,
+                             const MachineBasicBlock &Then,
+                             const MachineBasicBlock &Tail) {
+    bool IsWave32 = TII.isWave32();
+    unsigned AndSaveExecOpc =
+        IsWave32 ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
+    unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    unsigned NewOps[] = {*getVALUOpc(Cmp, !SCC1JumpsToThen(Term, Then)),
+                         AndSaveExecOpc, Mov};
+    unsigned NewOpsCost = 0;
+    for (unsigned Opc : NewOps)
+      NewOpsCost += SchedModel.computeInstrLatency(Opc);
+    unsigned OldCmpCost = SchedModel.computeInstrLatency(&Cmp, false);
+
+    assert(NewOpsCost >= OldCmpCost);
+    return !TII.mustRetainExeczBranch(Term, Then, Tail,
+                                      NewOpsCost - OldCmpCost);
+  }
+
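+  // Rewrite the S_CMP into the equivalent V_CMP defining VCC, mask EXEC with
+  // the result while saving its old value, replace the SCC branch with an
+  // EXECZ branch, and restore EXEC at the head of the tail block.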
+  void demoteCmp(MachineInstr &Term, MachineInstr &Cmp, MachineBasicBlock &Head,
+                 MachineBasicBlock &Then, MachineBasicBlock &Tail) {
+    unsigned NewCmpOpc = *getVALUOpc(Cmp, !SCC1JumpsToThen(Term, Then));
+    Cmp.setDesc(TII.get(NewCmpOpc));
+
+    // Drop the S_CMP operands (two sources plus the implicit SCC def) and
+    // rebuild them in V_CMP form: a VCC def followed by the two sources.
+    MachineOperand L = Cmp.getOperand(0);
+    MachineOperand R = Cmp.getOperand(1);
+    for (unsigned i = 3; i != 0; --i)
+      Cmp.removeOperand(i - 1);
 
-  bool run() { return false; }
+    auto VCC = RegInfo.getVCC();
+    auto Exec = RegInfo.getExec();
+
+    auto &MRI = MF.getRegInfo();
+    Register ExecBackup =
+        MRI.createVirtualRegister(RegInfo.getPhysRegBaseClass(Exec));
+
+    Cmp.addOperand(MachineOperand::CreateReg(VCC, true));
+    Cmp.addOperand(L);
+    Cmp.addOperand(R);
+    Cmp.addImplicitDefUseOperands(MF);
+
+    TII.legalizeOperands(Cmp);
+
+    bool IsWave32 = TII.isWave32();
+    unsigned AndSaveExecOpc =
+        IsWave32 ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
+    auto SaveAndMaskExec = BuildMI(*Term.getParent(), Term, Cmp.getDebugLoc(),
+                                   TII.get(AndSaveExecOpc), ExecBackup);
+    SaveAndMaskExec.addReg(VCC, RegState::Kill);
+    SaveAndMaskExec->getOperand(3).setIsDead(); // Mark SCC as dead.
+
+    DebugLoc DL = Term.getDebugLoc();
+    TII.removeBranch(Head);
+    MachineOperand Cond[] = {
+        MachineOperand::CreateImm(SIInstrInfo::BranchPredicate::EXECZ),
+        MachineOperand::CreateReg(RegInfo.getExec(), false)};
+    TII.insertBranch(Head, &Tail, &Then, Cond, DL);
+
+    TII.restoreExec(MF, Tail, Tail.instr_begin(), DebugLoc(), ExecBackup);
+  }
+
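+  // Walk the function, demoting every triangular SCC branch the cost model
+  // considers profitable.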
+  bool run() {
+    if (!SchedModel.hasInstrSchedModel())
+      return false;
+    bool Changed = false;
+
+    for (MachineBasicBlock &Head : MF) {
+      MachineInstr *Term;
+      MachineInstr *Cmp;
+      MachineBasicBlock *Then;
+      MachineBasicBlock *Tail;
+      if (!isTriangularSCCBranch(Head, Term, Cmp, Then, Tail))
+        continue;
+
+      if (!shouldDemoteSCCBranch(*Term, *Cmp, *Then, *Tail))
+        continue;
+
+      demoteCmp(*Term, *Cmp, Head, *Then, *Tail);
+      Changed = true;
+    }
+    return Changed;
+  }
 };
 
 class AMDGPUDemoteSCCBranchToExeczLegacy : public MachineFunctionPass {
@@ -22,7 +276,7 @@ class AMDGPUDemoteSCCBranchToExeczLegacy : public MachineFunctionPass {
   AMDGPUDemoteSCCBranchToExeczLegacy() : MachineFunctionPass(ID) {}
 
   bool runOnMachineFunction(MachineFunction &MF) override {
-    AMDGPUDemoteSCCBranchToExecz IfCvt{};
+    AMDGPUDemoteSCCBranchToExecz IfCvt{MF};
     return IfCvt.run();
   }
 
@@ -39,7 +293,7 @@ char AMDGPUDemoteSCCBranchToExeczLegacy::ID = 0;
 
 PreservedAnalyses llvm::AMDGPUDemoteSCCBranchToExeczPass::run(
     MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) {
-  AMDGPUDemoteSCCBranchToExecz IfCvt{};
+  AMDGPUDemoteSCCBranchToExecz IfCvt{MF};
   if (!IfCvt.run())
     return PreservedAnalyses::all();
   return PreservedAnalyses::none();