Skip to content

Commit 5df2af8

Browse files
committed
[AMDGPU] Merge SIRemoveShortExecBranches into SIPreEmitPeephole
SIRemoveShortExecBranches is an optimisation, so it fits well in the context of SIPreEmitPeephole. The test changes relate to early termination from kills, which have now been lowered prior to considering branches for removal. As these use s_cbranch, the execz skips are now retained instead. Currently either behaviour is valid, as a kill with EXEC=0 is a nop; however, if early termination is used differently in future, then the new behaviour is the correct one. Reviewed By: foad. Differential Revision: https://reviews.llvm.org/D98917
1 parent 8bc3685 commit 5df2af8

16 files changed

+211
-234
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -204,9 +204,6 @@ extern char &SIWholeQuadModeID;
204204
void initializeSILowerControlFlowPass(PassRegistry &);
205205
extern char &SILowerControlFlowID;
206206

207-
void initializeSIRemoveShortExecBranchesPass(PassRegistry &);
208-
extern char &SIRemoveShortExecBranchesID;
209-
210207
void initializeSIPreEmitPeepholePass(PassRegistry &);
211208
extern char &SIPreEmitPeepholeID;
212209

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -249,7 +249,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
249249
initializeSIModeRegisterPass(*PR);
250250
initializeSIWholeQuadModePass(*PR);
251251
initializeSILowerControlFlowPass(*PR);
252-
initializeSIRemoveShortExecBranchesPass(*PR);
253252
initializeSIPreEmitPeepholePass(*PR);
254253
initializeSIInsertSkipsPass(*PR);
255254
initializeSIMemoryLegalizerPass(*PR);
@@ -1215,7 +1214,6 @@ void GCNPassConfig::addPreEmitPass() {
12151214
if (getOptLevel() > CodeGenOpt::None)
12161215
addPass(&SIInsertHardClausesID);
12171216

1218-
addPass(&SIRemoveShortExecBranchesID);
12191217
addPass(&SIInsertSkipsPassID);
12201218
addPass(&SIPreEmitPeepholeID);
12211219
// The hazard recognizer that runs as part of the post-ra scheduler does not

llvm/lib/Target/AMDGPU/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,6 @@ add_llvm_target(AMDGPUCodeGen
137137
SIPreEmitPeephole.cpp
138138
SIProgramInfo.cpp
139139
SIRegisterInfo.cpp
140-
SIRemoveShortExecBranches.cpp
141140
SIShrinkInstructions.cpp
142141
SIWholeQuadMode.cpp
143142
GCNILPSched.cpp

llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp

Lines changed: 89 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,14 @@ using namespace llvm;
2121

2222
#define DEBUG_TYPE "si-pre-emit-peephole"
2323

24+
// Storage for the value of the -amdgpu-skip-threshold option declared below
// (bound to it through cl::location). mustRetainExeczBranch keeps an execz
// branch once the number of instructions it has counted reaches this value.
static unsigned SkipThreshold;
25+
26+
// Hidden command-line option "amdgpu-skip-threshold". The parsed value is
// written to SkipThreshold rather than stored in the option object itself
// (external-storage form: cl::opt<unsigned, true>). Initial value is 12.
static cl::opt<unsigned, true> SkipThresholdFlag(
27+
"amdgpu-skip-threshold", cl::Hidden,
28+
cl::desc(
29+
"Number of instructions before jumping over divergent control flow"),
30+
cl::location(SkipThreshold), cl::init(12));
31+
2432
namespace {
2533

2634
class SIPreEmitPeephole : public MachineFunctionPass {
@@ -30,6 +38,13 @@ class SIPreEmitPeephole : public MachineFunctionPass {
3038

3139
bool optimizeVccBranch(MachineInstr &MI) const;
3240
bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
41+
bool getBlockDestinations(MachineBasicBlock &SrcMBB,
42+
MachineBasicBlock *&TrueMBB,
43+
MachineBasicBlock *&FalseMBB,
44+
SmallVectorImpl<MachineOperand> &Cond);
45+
bool mustRetainExeczBranch(const MachineBasicBlock &From,
46+
const MachineBasicBlock &To) const;
47+
bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
3348

3449
public:
3550
static char ID;
@@ -258,24 +273,97 @@ bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
258273
return true;
259274
}
260275

276+
// Determines the two destinations of the terminators of SrcMBB.
//
// TrueMBB, FalseMBB and Cond are out-parameters filled in by
// TII->analyzeBranch. If analyzeBranch leaves FalseMBB null, FalseMBB is set
// to the block that follows SrcMBB in the function's block list.
//
// Returns false when analyzeBranch returns true (treated here as "could not
// analyze"), and true otherwise.
bool SIPreEmitPeephole::getBlockDestinations(
277+
MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
278+
MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
279+
if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
280+
return false;
281+
282+
// No explicit false destination was reported: use the next block in the
// function's block list.
// NOTE(review): getNextNode() is presumably null when SrcMBB is the last
// block of the function -- confirm callers tolerate a null FalseMBB.
if (!FalseMBB)
283+
FalseMBB = SrcMBB.getNextNode();
284+
285+
return true;
286+
}
287+
288+
// Decides whether an execz branch that skips the blocks from From up to (but
// not including) To has to be kept.
//
// Walks the function's block list starting at From and stops at To or at the
// end of the function, whichever comes first. Returns true as soon as any
// visited instruction
//   - is a conditional branch,
//   - has unwanted effects when EXEC is empty (per TII),
//   - is an SMRD, VMEM, FLAT or DS instruction, or an S_WAITCNT, or
//   - brings the running instruction count up to SkipThreshold.
// Returns false if the whole range is walked without hitting any of these.
bool SIPreEmitPeephole::mustRetainExeczBranch(
289+
const MachineBasicBlock &From, const MachineBasicBlock &To) const {
290+
// Number of instructions seen so far that passed all the checks below.
unsigned NumInstr = 0;
291+
const MachineFunction *MF = From.getParent();
292+
293+
// Iterate blocks in function order; the End check keeps the walk bounded
// if To is never encountered after From.
for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
294+
MBBI != End && MBBI != ToI; ++MBBI) {
295+
const MachineBasicBlock &MBB = *MBBI;
296+
297+
for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
298+
I != E; ++I) {
299+
// When a uniform loop is inside non-uniform control flow, the branch
300+
// leaving the loop might never be taken when EXEC = 0.
301+
// Hence we should retain cbranch out of the loop lest it become infinite.
302+
if (I->isConditionalBranch())
303+
return true;
304+
305+
if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
306+
return true;
307+
308+
// These instructions are potentially expensive even if EXEC = 0.
309+
if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
310+
TII->isDS(*I) || I->getOpcode() == AMDGPU::S_WAITCNT)
311+
return true;
312+
313+
// Only instructions that survived every check above are counted.
++NumInstr;
314+
if (NumInstr >= SkipThreshold)
315+
return true;
316+
}
317+
}
318+
319+
return false;
320+
}
321+
322+
// Tries to delete the execz branch MI that terminates SrcMBB.
//
// The branch is left in place (and false is returned) when the destinations
// of SrcMBB cannot be determined, when the taken destination does not have a
// higher block number than SrcMBB, or when mustRetainExeczBranch reports that
// the skipped range must stay guarded. Otherwise MI is erased and the taken
// destination is removed from SrcMBB's successor list.
//
// Returns true if the skip branch instruction is removed.
323+
bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
324+
MachineBasicBlock &SrcMBB) {
325+
MachineBasicBlock *TrueMBB = nullptr;
326+
MachineBasicBlock *FalseMBB = nullptr;
327+
SmallVector<MachineOperand, 1> Cond;
328+
329+
if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
330+
return false;
331+
332+
// Consider only the forward branches.
// "Forward" is decided by comparing block numbers; runOnMachineFunction
// calls MF.RenumberBlocks() before visiting any block.
// NOTE(review): TrueMBB and FalseMBB are dereferenced without a null check;
// presumably both are always set when the terminator is S_CBRANCH_EXECZ --
// confirm.
333+
if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) ||
334+
mustRetainExeczBranch(*FalseMBB, *TrueMBB))
335+
return false;
336+
337+
LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
338+
MI.eraseFromParent();
339+
// With the branch gone, TrueMBB is dropped from SrcMBB's successor list.
SrcMBB.removeSuccessor(TrueMBB);
340+
341+
return true;
342+
}
343+
261344
bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
262345
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
263346
TII = ST.getInstrInfo();
264347
TRI = &TII->getRegisterInfo();
265348
MachineBasicBlock *EmptyMBBAtEnd = nullptr;
266349
bool Changed = false;
267350

351+
MF.RenumberBlocks();
352+
268353
for (MachineBasicBlock &MBB : MF) {
269354
MachineBasicBlock::iterator MBBE = MBB.getFirstTerminator();
270355
MachineBasicBlock::iterator TermI = MBBE;
271-
// Check first terminator for VCC branches to optimize
356+
// Check first terminator for branches to optimize
272357
if (TermI != MBB.end()) {
273358
MachineInstr &MI = *TermI;
274359
switch (MI.getOpcode()) {
275360
case AMDGPU::S_CBRANCH_VCCZ:
276361
case AMDGPU::S_CBRANCH_VCCNZ:
277362
Changed |= optimizeVccBranch(MI);
278363
continue;
364+
case AMDGPU::S_CBRANCH_EXECZ:
365+
Changed |= removeExeczBranch(MI, MBB);
366+
continue;
279367
default:
280368
break;
281369
}

llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp

Lines changed: 0 additions & 159 deletions
This file was deleted.

0 commit comments

Comments
 (0)