@@ -21,6 +21,14 @@ using namespace llvm;
21
21
22
22
// Debug category used by the LLVM_DEBUG output in this file. The name must
// not carry leading whitespace, otherwise it cannot be selected by name.
#define DEBUG_TYPE "si-pre-emit-peephole"
23
23
24
+ static unsigned SkipThreshold;
25
+
26
+ static cl::opt<unsigned , true > SkipThresholdFlag (
27
+ " amdgpu-skip-threshold" , cl::Hidden,
28
+ cl::desc (
29
+ " Number of instructions before jumping over divergent control flow" ),
30
+ cl::location(SkipThreshold), cl::init(12 ));
31
+
24
32
namespace {
25
33
26
34
class SIPreEmitPeephole : public MachineFunctionPass {
@@ -30,6 +38,13 @@ class SIPreEmitPeephole : public MachineFunctionPass {
30
38
31
39
bool optimizeVccBranch (MachineInstr &MI) const ;
32
40
bool optimizeSetGPR (MachineInstr &First, MachineInstr &MI) const ;
41
+ bool getBlockDestinations (MachineBasicBlock &SrcMBB,
42
+ MachineBasicBlock *&TrueMBB,
43
+ MachineBasicBlock *&FalseMBB,
44
+ SmallVectorImpl<MachineOperand> &Cond);
45
+ bool mustRetainExeczBranch (const MachineBasicBlock &From,
46
+ const MachineBasicBlock &To) const ;
47
+ bool removeExeczBranch (MachineInstr &MI, MachineBasicBlock &SrcMBB);
33
48
34
49
public:
35
50
static char ID;
@@ -258,24 +273,97 @@ bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
258
273
return true ;
259
274
}
260
275
276
+ bool SIPreEmitPeephole::getBlockDestinations (
277
+ MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
278
+ MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
279
+ if (TII->analyzeBranch (SrcMBB, TrueMBB, FalseMBB, Cond))
280
+ return false ;
281
+
282
+ if (!FalseMBB)
283
+ FalseMBB = SrcMBB.getNextNode ();
284
+
285
+ return true ;
286
+ }
287
+
288
+ bool SIPreEmitPeephole::mustRetainExeczBranch (
289
+ const MachineBasicBlock &From, const MachineBasicBlock &To) const {
290
+ unsigned NumInstr = 0 ;
291
+ const MachineFunction *MF = From.getParent ();
292
+
293
+ for (MachineFunction::const_iterator MBBI (&From), ToI (&To), End = MF->end ();
294
+ MBBI != End && MBBI != ToI; ++MBBI) {
295
+ const MachineBasicBlock &MBB = *MBBI;
296
+
297
+ for (MachineBasicBlock::const_iterator I = MBB.begin (), E = MBB.end ();
298
+ I != E; ++I) {
299
+ // When a uniform loop is inside non-uniform control flow, the branch
300
+ // leaving the loop might never be taken when EXEC = 0.
301
+ // Hence we should retain cbranch out of the loop lest it become infinite.
302
+ if (I->isConditionalBranch ())
303
+ return true ;
304
+
305
+ if (TII->hasUnwantedEffectsWhenEXECEmpty (*I))
306
+ return true ;
307
+
308
+ // These instructions are potentially expensive even if EXEC = 0.
309
+ if (TII->isSMRD (*I) || TII->isVMEM (*I) || TII->isFLAT (*I) ||
310
+ TII->isDS (*I) || I->getOpcode () == AMDGPU::S_WAITCNT)
311
+ return true ;
312
+
313
+ ++NumInstr;
314
+ if (NumInstr >= SkipThreshold)
315
+ return true ;
316
+ }
317
+ }
318
+
319
+ return false ;
320
+ }
321
+
322
+ // Returns true if the skip branch instruction is removed.
323
+ bool SIPreEmitPeephole::removeExeczBranch (MachineInstr &MI,
324
+ MachineBasicBlock &SrcMBB) {
325
+ MachineBasicBlock *TrueMBB = nullptr ;
326
+ MachineBasicBlock *FalseMBB = nullptr ;
327
+ SmallVector<MachineOperand, 1 > Cond;
328
+
329
+ if (!getBlockDestinations (SrcMBB, TrueMBB, FalseMBB, Cond))
330
+ return false ;
331
+
332
+ // Consider only the forward branches.
333
+ if ((SrcMBB.getNumber () >= TrueMBB->getNumber ()) ||
334
+ mustRetainExeczBranch (*FalseMBB, *TrueMBB))
335
+ return false ;
336
+
337
+ LLVM_DEBUG (dbgs () << " Removing the execz branch: " << MI);
338
+ MI.eraseFromParent ();
339
+ SrcMBB.removeSuccessor (TrueMBB);
340
+
341
+ return true ;
342
+ }
343
+
261
344
bool SIPreEmitPeephole::runOnMachineFunction (MachineFunction &MF) {
262
345
const GCNSubtarget &ST = MF.getSubtarget <GCNSubtarget>();
263
346
TII = ST.getInstrInfo ();
264
347
TRI = &TII->getRegisterInfo ();
265
348
MachineBasicBlock *EmptyMBBAtEnd = nullptr ;
266
349
bool Changed = false ;
267
350
351
+ MF.RenumberBlocks ();
352
+
268
353
for (MachineBasicBlock &MBB : MF) {
269
354
MachineBasicBlock::iterator MBBE = MBB.getFirstTerminator ();
270
355
MachineBasicBlock::iterator TermI = MBBE;
271
- // Check first terminator for VCC branches to optimize
356
+ // Check first terminator for branches to optimize
272
357
if (TermI != MBB.end ()) {
273
358
MachineInstr &MI = *TermI;
274
359
switch (MI.getOpcode ()) {
275
360
case AMDGPU::S_CBRANCH_VCCZ:
276
361
case AMDGPU::S_CBRANCH_VCCNZ:
277
362
Changed |= optimizeVccBranch (MI);
278
363
continue ;
364
+ case AMDGPU::S_CBRANCH_EXECZ:
365
+ Changed |= removeExeczBranch (MI, MBB);
366
+ continue ;
279
367
default :
280
368
break ;
281
369
}
0 commit comments