#include "llvm/CodeGen/MachineFunctionPass.h"

#include "AMDGPU.h"
+ #include "GCNSubtarget.h"
+ #include "SIInstrInfo.h"
+ #include "SIRegisterInfo.h"

using namespace llvm;

namespace {
#define DEBUG_TYPE "amdgpu-demote-scc-to-execz"
- const char PassName[] = "AMDGPU if conversion";
+ const char PassName[] = "AMDGPU s_cbranch_scc to s_cbranch_execz conversion";
+
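+ // Map a scalar S_CMP_* opcode to the equivalent VALU V_CMP_*_e64 opcode, or
+ // std::nullopt if the compare has no VALU counterpart. When Reverse is set,
+ // the condition is logically inverted first (e.g. EQ becomes LG) so callers
+ // can flip the branch polarity.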
+ std::optional<unsigned> getVALUOpc(const MachineInstr &MI,
+                                    bool Reverse = false) {
+   unsigned Opc = MI.getOpcode();
+   if (Reverse) {
+     switch (Opc) {
+     case AMDGPU::S_CMP_EQ_I32:
+       Opc = AMDGPU::S_CMP_LG_I32;
+       break;
+     case AMDGPU::S_CMP_LG_I32:
+       Opc = AMDGPU::S_CMP_EQ_I32;
+       break;
+     case AMDGPU::S_CMP_GT_I32:
+       Opc = AMDGPU::S_CMP_LE_I32;
+       break;
+     case AMDGPU::S_CMP_GE_I32:
+       Opc = AMDGPU::S_CMP_LT_I32;
+       break;
+     case AMDGPU::S_CMP_LT_I32:
+       Opc = AMDGPU::S_CMP_GE_I32;
+       break;
+     case AMDGPU::S_CMP_LE_I32:
+       Opc = AMDGPU::S_CMP_GT_I32;
+       break;
+     case AMDGPU::S_CMP_EQ_U32:
+       Opc = AMDGPU::S_CMP_LG_U32;
+       break;
+     case AMDGPU::S_CMP_LG_U32:
+       Opc = AMDGPU::S_CMP_EQ_U32;
+       break;
+     case AMDGPU::S_CMP_GT_U32:
+       Opc = AMDGPU::S_CMP_LE_U32;
+       break;
+     case AMDGPU::S_CMP_GE_U32:
+       Opc = AMDGPU::S_CMP_LT_U32;
+       break;
+     case AMDGPU::S_CMP_LT_U32:
+       Opc = AMDGPU::S_CMP_GE_U32;
+       break;
+     case AMDGPU::S_CMP_LE_U32:
+       Opc = AMDGPU::S_CMP_GT_U32;
+       break;
+     case AMDGPU::S_CMP_EQ_U64:
+       Opc = AMDGPU::S_CMP_LG_U64;
+       break;
+     case AMDGPU::S_CMP_LG_U64:
+       Opc = AMDGPU::S_CMP_EQ_U64;
+       break;
+     default:
+       return std::nullopt;
+     }
+   }
+
+   switch (Opc) {
+   case AMDGPU::S_CMP_EQ_I32:
+     return AMDGPU::V_CMP_EQ_I32_e64;
+   case AMDGPU::S_CMP_LG_I32:
+     return AMDGPU::V_CMP_NE_I32_e64;
+   case AMDGPU::S_CMP_GT_I32:
+     return AMDGPU::V_CMP_GT_I32_e64;
+   case AMDGPU::S_CMP_GE_I32:
+     return AMDGPU::V_CMP_GE_I32_e64;
+   case AMDGPU::S_CMP_LT_I32:
+     return AMDGPU::V_CMP_LT_I32_e64;
+   case AMDGPU::S_CMP_LE_I32:
+     return AMDGPU::V_CMP_LE_I32_e64;
+   case AMDGPU::S_CMP_EQ_U32:
+     return AMDGPU::V_CMP_EQ_U32_e64;
+   case AMDGPU::S_CMP_LG_U32:
+     return AMDGPU::V_CMP_NE_U32_e64;
+   case AMDGPU::S_CMP_GT_U32:
+     return AMDGPU::V_CMP_GT_U32_e64;
+   case AMDGPU::S_CMP_GE_U32:
+     return AMDGPU::V_CMP_GE_U32_e64;
+   case AMDGPU::S_CMP_LT_U32:
+     return AMDGPU::V_CMP_LT_U32_e64;
+   case AMDGPU::S_CMP_LE_U32:
+     return AMDGPU::V_CMP_LE_U32_e64;
+   case AMDGPU::S_CMP_EQ_U64:
+     return AMDGPU::V_CMP_EQ_U64_e64;
+   case AMDGPU::S_CMP_LG_U64:
+     return AMDGPU::V_CMP_NE_U64_e64;
+   default:
+     return std::nullopt;
+   }
+ }
+
+ bool isSCmpPromotableToVCmp(const MachineInstr &MI) {
+   return getVALUOpc(MI).has_value();
+ }
+
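+ // Match a triangle-shaped CFG: Head has exactly two successors, Then has
+ // Head as its only predecessor, and Then's only successor is Tail.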
+ bool isTriangular(MachineBasicBlock &Head, MachineBasicBlock *&Then,
+                   MachineBasicBlock *&Tail) {
+   if (Head.succ_size() != 2)
+     return false;
+
+   Then = Head.succ_begin()[0];
+   Tail = Head.succ_begin()[1];
+
+   // Canonicalize so that Then is the successor with Head as its single
+   // predecessor.
+   if (Then->pred_size() != 1)
+     std::swap(Then, Tail);
+
+   if (Then->pred_size() != 1 || Then->succ_size() != 1)
+     return false;
+
+   return *Then->succ_begin() == Tail;
+ }
+
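+ // The compare feeding the terminator must be the instruction immediately
+ // before it, and must be promotable to a VALU compare.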
+ bool hasPromotableCmpCondition(MachineInstr &Term, MachineInstr *&Cmp) {
+   auto CmpIt = std::next(Term.getReverseIterator());
+   if (CmpIt == Term.getParent()->instr_rend())
+     return false;
+
+   if (!isSCmpPromotableToVCmp(*CmpIt))
+     return false;
+
+   Cmp = &*CmpIt;
+   return true;
+ }
+
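+ // Check that Head's first terminator is an S_CBRANCH_SCC0/SCC1 and return it.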
+ bool hasCbranchSCCTerm(MachineBasicBlock &Head, MachineInstr *&Term) {
+   auto TermIt = Head.getFirstInstrTerminator();
+   if (TermIt == Head.end())
+     return false;
+
+   switch (TermIt->getOpcode()) {
+   case AMDGPU::S_CBRANCH_SCC0:
+   case AMDGPU::S_CBRANCH_SCC1:
+     Term = &*TermIt;
+     return true;
+   default:
+     return false;
+   }
+ }
+
+ bool isTriangularSCCBranch(MachineBasicBlock &Head, MachineInstr *&Term,
+                            MachineInstr *&Cmp, MachineBasicBlock *&Then,
+                            MachineBasicBlock *&Tail) {
+   if (!hasCbranchSCCTerm(Head, Term))
+     return false;
+
+   if (!isTriangular(Head, Then, Tail))
+     return false;
+
+   // PHI nodes in the tail would prevent splicing the instructions of the
+   // Then and Tail blocks into the Head block later on.
+   if (!Tail->empty() && Tail->begin()->isPHI())
+     return false;
+
+   if (!hasPromotableCmpCondition(*Term, Cmp))
+     return false;
+
+   return true;
+ }
+
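+ // Return true if SCC == 1 transfers control to Then, either because the
+ // branch is an S_CBRANCH_SCC1 targeting Then or an S_CBRANCH_SCC0 that
+ // falls through to it.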
+ bool SCC1JumpsToThen(const MachineInstr &Term, const MachineBasicBlock &Then) {
+   MachineBasicBlock *TBB = Term.getOperand(0).getMBB();
+   return (TBB == &Then) == (Term.getOpcode() == AMDGPU::S_CBRANCH_SCC1);
+ }

class AMDGPUDemoteSCCBranchToExecz {
+   MachineFunction &MF;
+   const GCNSubtarget &ST;
+   const SIInstrInfo &TII;
+   const SIRegisterInfo &RegInfo;
+   const TargetSchedModel &SchedModel;
+
public:
-   AMDGPUDemoteSCCBranchToExecz() = default;
+   AMDGPUDemoteSCCBranchToExecz(MachineFunction &MF)
+       : MF(MF), ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
+         RegInfo(*ST.getRegisterInfo()), SchedModel(TII.getSchedModel()) {}
+
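+   // Estimate the cost of the demotion: the latencies of the instructions it
+   // adds (the VALU compare, S_AND_SAVEEXEC, and the S_MOV restoring exec)
+   // less the S_CMP it replaces, then ask whether an execz branch carrying
+   // that extra cost would still have to be kept. Despite the name, this
+   // returns true when demoting the SCC branch is worthwhile.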
+   bool mustRetainSCCBranch(const MachineInstr &Term, const MachineInstr &Cmp,
+                            const MachineBasicBlock &Then,
+                            const MachineBasicBlock &Tail) {
+     bool IsWave32 = TII.isWave32();
+     unsigned AndSaveExecOpc =
+         IsWave32 ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
+     unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+     unsigned NewOps[] = {*getVALUOpc(Cmp, !SCC1JumpsToThen(Term, Then)),
+                          AndSaveExecOpc, Mov};
+     unsigned NewOpsCost = 0;
+     for (unsigned Opc : NewOps)
+       NewOpsCost += SchedModel.computeInstrLatency(Opc);
+     unsigned OldCmpCost = SchedModel.computeInstrLatency(&Cmp, false);
+
+     assert(NewOpsCost >= OldCmpCost);
+     return !TII.mustRetainExeczBranch(*Term.getParent(), Then, Tail,
+                                       NewOpsCost - OldCmpCost);
+   }
+
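+   // Rewrite the compare-and-branch so the Then block is skipped via exec
+   // rather than SCC, e.g. (wave64, SCC1 jumping to Then):
+   //
+   //   s_cmp_lt_i32 s0, s1           v_cmp_lt_i32_e64 vcc, s0, s1
+   //   s_cbranch_scc1 %then    ==>   s_and_saveexec_b64 s[2:3], vcc
+   //                                 s_cbranch_execz %tail
+   //
+   // The saved exec mask is restored at the start of Tail.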
+   void demoteCmp(MachineInstr &Term, MachineInstr &Cmp, MachineBasicBlock &Head,
+                  MachineBasicBlock &Then, MachineBasicBlock &Tail) {
+     unsigned NewCmpOpc = *getVALUOpc(Cmp, !SCC1JumpsToThen(Term, Then));
+     Cmp.setDesc(TII.get(NewCmpOpc));
+
+     // Save the compare operands, then strip all operands (including the
+     // implicit def of SCC) before rebuilding the instruction in VALU form.
+     MachineOperand L = Cmp.getOperand(0);
+     MachineOperand R = Cmp.getOperand(1);
+     for (unsigned i = 3; i != 0; --i)
+       Cmp.removeOperand(i - 1);

-   bool run() { return false; }
+     auto VCC = RegInfo.getVCC();
+     auto Exec = RegInfo.getExec();
+
+     auto &MRI = MF.getRegInfo();
+     Register ExecBackup =
+         MRI.createVirtualRegister(RegInfo.getPhysRegBaseClass(Exec));
+
+     Cmp.addOperand(MachineOperand::CreateReg(VCC, true));
+     Cmp.addOperand(L);
+     Cmp.addOperand(R);
+     Cmp.addImplicitDefUseOperands(MF);
+
+     TII.legalizeOperands(Cmp);
+
+     bool IsWave32 = TII.isWave32();
+     unsigned AndSaveExecOpc =
+         IsWave32 ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
+     auto SaveAndMaskExec = BuildMI(*Term.getParent(), Term, Cmp.getDebugLoc(),
+                                    TII.get(AndSaveExecOpc), ExecBackup);
+     SaveAndMaskExec.addReg(VCC, RegState::Kill);
+     SaveAndMaskExec->getOperand(3).setIsDead(); // Mark SCC as dead.
+
+     DebugLoc DL = Term.getDebugLoc();
+     TII.removeBranch(Head);
+     MachineOperand Cond[] = {
+         MachineOperand::CreateImm(SIInstrInfo::BranchPredicate::EXECZ),
+         MachineOperand::CreateReg(RegInfo.getExec(), false)};
+     TII.insertBranch(Head, &Tail, &Then, Cond, DL);
+
+     TII.restoreExec(MF, Tail, Tail.instr_begin(), DebugLoc(), ExecBackup);
+   }
+
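+   // Scan every block for a triangle headed by an S_CBRANCH_SCC whose
+   // condition comes from a promotable S_CMP, and demote each candidate the
+   // cost model accepts. A scheduling model is required for the cost check.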
+   bool run() {
+     if (!SchedModel.hasInstrSchedModel())
+       return false;
+     bool Changed = false;
+
+     for (MachineBasicBlock &Head : MF) {
+       MachineInstr *Term;
+       MachineInstr *Cmp;
+       MachineBasicBlock *Then;
+       MachineBasicBlock *Tail;
+       if (!isTriangularSCCBranch(Head, Term, Cmp, Then, Tail))
+         continue;
+
+       if (!mustRetainSCCBranch(*Term, *Cmp, *Then, *Tail))
+         continue;
+
+       demoteCmp(*Term, *Cmp, Head, *Then, *Tail);
+       Changed = true;
+     }
+     return Changed;
+   }
};

class AMDGPUDemoteSCCBranchToExeczLegacy : public MachineFunctionPass {
@@ -22,7 +275,7 @@ class AMDGPUDemoteSCCBranchToExeczLegacy : public MachineFunctionPass {
  AMDGPUDemoteSCCBranchToExeczLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override {
-     AMDGPUDemoteSCCBranchToExecz IfCvt{};
+     AMDGPUDemoteSCCBranchToExecz IfCvt{MF};
    return IfCvt.run();
  }

@@ -39,7 +292,7 @@ char AMDGPUDemoteSCCBranchToExeczLegacy::ID = 0;

PreservedAnalyses llvm::AMDGPUDemoteSCCBranchToExeczPass::run(
    MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) {
-   AMDGPUDemoteSCCBranchToExecz IfCvt{};
+   AMDGPUDemoteSCCBranchToExecz IfCvt{MF};
  if (!IfCvt.run())
    return PreservedAnalyses::all();
  return PreservedAnalyses::none();