Skip to content

Commit 4610dfa

Browse files
committed
[AMDGPU][SIPreEmitPeephole] mustRetainExeczBranch: use BranchProbability and TargetSchedModel
Remove s_cbranch_execz branches when the transformation is profitable according to BranchProbability and TargetSchedModel.
1 parent 816134b commit 4610dfa

20 files changed

+187
-230
lines changed

llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp

Lines changed: 74 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -15,19 +15,13 @@
1515
#include "GCNSubtarget.h"
1616
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
1717
#include "llvm/CodeGen/MachineFunctionPass.h"
18+
#include "llvm/CodeGen/TargetSchedule.h"
19+
#include "llvm/Support/BranchProbability.h"
1820

1921
using namespace llvm;
2022

2123
#define DEBUG_TYPE "si-pre-emit-peephole"
2224

23-
static unsigned SkipThreshold;
24-
25-
static cl::opt<unsigned, true> SkipThresholdFlag(
26-
"amdgpu-skip-threshold", cl::Hidden,
27-
cl::desc(
28-
"Number of instructions before jumping over divergent control flow"),
29-
cl::location(SkipThreshold), cl::init(12));
30-
3125
namespace {
3226

3327
class SIPreEmitPeephole : public MachineFunctionPass {
@@ -41,7 +35,8 @@ class SIPreEmitPeephole : public MachineFunctionPass {
4135
MachineBasicBlock *&TrueMBB,
4236
MachineBasicBlock *&FalseMBB,
4337
SmallVectorImpl<MachineOperand> &Cond);
44-
bool mustRetainExeczBranch(const MachineBasicBlock &From,
38+
bool mustRetainExeczBranch(const MachineBasicBlock &Head,
39+
const MachineBasicBlock &From,
4540
const MachineBasicBlock &To) const;
4641
bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
4742

@@ -304,11 +299,67 @@ bool SIPreEmitPeephole::getBlockDestinations(
304299
return true;
305300
}
306301

302+
namespace {
303+
class BranchWeightCostModel {
304+
const SIInstrInfo &TII;
305+
const TargetSchedModel &SchedModel;
306+
BranchProbability BranchProb;
307+
uint64_t BranchCost;
308+
uint64_t ThenCyclesCost = 0;
309+
310+
public:
311+
BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch,
312+
const MachineBasicBlock &Succ)
313+
: TII(TII), SchedModel(TII.getSchedModel()) {
314+
assert(SchedModel.hasInstrSchedModelOrItineraries());
315+
316+
const MachineBasicBlock &Head = *Branch.getParent();
317+
const auto *FromIt = find(Head.successors(), &Succ);
318+
assert(FromIt != Head.succ_end());
319+
320+
BranchProb = Head.getSuccProbability(FromIt);
321+
if (BranchProb.isUnknown())
322+
return;
323+
324+
BranchCost = SchedModel.computeInstrLatency(&Branch, false);
325+
}
326+
327+
bool isUnknown() const { return BranchProb.isUnknown(); }
328+
329+
bool isProfitable(const MachineInstr &MI) {
330+
assert(!isUnknown());
331+
332+
if (TII.isWaitcnt(MI.getOpcode()))
333+
return false;
334+
335+
ThenCyclesCost += SchedModel.computeInstrLatency(&MI, false);
336+
337+
// Consider `P = N/D` to be the probability of execnz being true
338+
// The transformation is profitable if always executing the 'then' block
339+
// is cheaper than executing sometimes 'then' and always
340+
// executing s_cbranch_execnz:
341+
// * ThenCost <= P*ThenCost + BranchCost
342+
// * (1-P) * ThenCost <= BranchCost
343+
// * (D-N)/D * ThenCost <= BranchCost
344+
uint64_t Numerator = BranchProb.getNumerator();
345+
uint64_t Denominator = BranchProb.getDenominator();
346+
return (Denominator - Numerator) * ThenCyclesCost <=
347+
Denominator * BranchCost;
348+
}
349+
};
350+
307351
bool SIPreEmitPeephole::mustRetainExeczBranch(
308-
const MachineBasicBlock &From, const MachineBasicBlock &To) const {
309-
unsigned NumInstr = 0;
310-
const MachineFunction *MF = From.getParent();
352+
const MachineBasicBlock &Head, const MachineBasicBlock &From,
353+
const MachineBasicBlock &To) const {
354+
355+
const auto *FromIt = find(Head.successors(), &From);
356+
assert(FromIt != Head.succ_end());
311357

358+
BranchWeightCostModel CostModel{*TII, *Head.getFirstTerminator(), From};
359+
if (CostModel.isUnknown())
360+
return true;
361+
362+
const MachineFunction *MF = From.getParent();
312363
for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
313364
MBBI != End && MBBI != ToI; ++MBBI) {
314365
const MachineBasicBlock &MBB = *MBBI;
@@ -326,23 +377,22 @@ bool SIPreEmitPeephole::mustRetainExeczBranch(
326377
if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
327378
return true;
328379

329-
// These instructions are potentially expensive even if EXEC = 0.
330-
if (TII->isSMRD(MI) || TII->isVMEM(MI) || TII->isFLAT(MI) ||
331-
TII->isDS(MI) || TII->isWaitcnt(MI.getOpcode()))
332-
return true;
333-
334-
++NumInstr;
335-
if (NumInstr >= SkipThreshold)
380+
if (!CostModel.isProfitable(MI))
336381
return true;
337382
}
338383
}
339384

340385
return false;
341386
}
387+
} // namespace
342388

343389
// Returns true if the skip branch instruction is removed.
344390
bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
345391
MachineBasicBlock &SrcMBB) {
392+
393+
if (!TII->getSchedModel().hasInstrSchedModelOrItineraries())
394+
return false;
395+
346396
MachineBasicBlock *TrueMBB = nullptr;
347397
MachineBasicBlock *FalseMBB = nullptr;
348398
SmallVector<MachineOperand, 1> Cond;
@@ -351,8 +401,11 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
351401
return false;
352402

353403
// Consider only the forward branches.
354-
if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) ||
355-
mustRetainExeczBranch(*FalseMBB, *TrueMBB))
404+
if (SrcMBB.getNumber() >= TrueMBB->getNumber())
405+
return false;
406+
407+
// Consider only when it is legal and profitable
408+
if (mustRetainExeczBranch(SrcMBB, *FalseMBB, *TrueMBB))
356409
return false;
357410

358411
LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);

llvm/test/CodeGen/AMDGPU/amdgpu-demote-scc-branches.ll

Lines changed: 3 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -292,7 +292,6 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8)
292292
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
293293
; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 0, v0
294294
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
295-
; GFX9-NEXT: s_cbranch_execz .LBB5_2
296295
; GFX9-NEXT: ; %bb.1: ; %if.then
297296
; GFX9-NEXT: s_mov_b32 s11, s18
298297
; GFX9-NEXT: s_mov_b32 s10, s17
@@ -301,7 +300,7 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8)
301300
; GFX9-NEXT: v_mov_b32_e32 v0, s6
302301
; GFX9-NEXT: v_mov_b32_e32 v1, s19
303302
; GFX9-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
304-
; GFX9-NEXT: .LBB5_2: ; %if.end
303+
; GFX9-NEXT: ; %bb.2: ; %if.end
305304
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
306305
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
307306
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -311,7 +310,6 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8)
311310
; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
312311
; GFX1010-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0, v0
313312
; GFX1010-NEXT: s_and_saveexec_b32 s4, vcc_lo
314-
; GFX1010-NEXT: s_cbranch_execz .LBB5_2
315313
; GFX1010-NEXT: ; %bb.1: ; %if.then
316314
; GFX1010-NEXT: v_mov_b32_e32 v0, s6
317315
; GFX1010-NEXT: v_mov_b32_e32 v1, s19
@@ -320,7 +318,7 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8)
320318
; GFX1010-NEXT: s_mov_b32 s9, s16
321319
; GFX1010-NEXT: s_mov_b32 s8, s7
322320
; GFX1010-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
323-
; GFX1010-NEXT: .LBB5_2: ; %if.end
321+
; GFX1010-NEXT: ; %bb.2: ; %if.end
324322
; GFX1010-NEXT: s_waitcnt_depctr 0xffe3
325323
; GFX1010-NEXT: s_or_b32 exec_lo, exec_lo, s4
326324
; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -331,7 +329,6 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8)
331329
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
332330
; GFX1030-NEXT: s_mov_b32 s4, exec_lo
333331
; GFX1030-NEXT: v_cmpx_lt_i32_e32 0, v0
334-
; GFX1030-NEXT: s_cbranch_execz .LBB5_2
335332
; GFX1030-NEXT: ; %bb.1: ; %if.then
336333
; GFX1030-NEXT: v_mov_b32_e32 v0, s6
337334
; GFX1030-NEXT: v_mov_b32_e32 v1, s19
@@ -340,7 +337,7 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8)
340337
; GFX1030-NEXT: s_mov_b32 s9, s16
341338
; GFX1030-NEXT: s_mov_b32 s8, s7
342339
; GFX1030-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
343-
; GFX1030-NEXT: .LBB5_2: ; %if.end
340+
; GFX1030-NEXT: ; %bb.2: ; %if.end
344341
; GFX1030-NEXT: s_or_b32 exec_lo, exec_lo, s4
345342
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
346343
; GFX1030-NEXT: s_setpc_b64 s[30:31]

llvm/test/CodeGen/AMDGPU/branch-condition-and.ll

Lines changed: 31 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
1-
; RUN: llc -mtriple=amdgcn -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=GCN %s
2-
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=GCN %s
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=amdgcn -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=GCN-NO-FLAT %s
3+
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=GCN-FLAT %s
34

45
; This used to crash because during intermediate control flow lowering, there
56
; was a sequence
@@ -9,20 +10,35 @@
910
; s_mov_b64_term exec, s[2:3]
1011
; that was not treated correctly.
1112
;
12-
; GCN-LABEL: {{^}}ham:
13-
; GCN-DAG: v_cmp_lt_f32_e64 [[OTHERCC:s\[[0-9]+:[0-9]+\]]],
14-
; GCN-DAG: v_cmp_lt_f32_e32 vcc,
15-
; GCN: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[OTHERCC]]
16-
; GCN: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[AND]]
17-
; GCN-NEXT: s_cbranch_execz .LBB0_{{[0-9]+}}
18-
19-
; GCN-NEXT: ; %bb.{{[0-9]+}}: ; %bb4
20-
; GCN: ds_write_b32
21-
22-
; GCN: .LBB0_{{[0-9]+}}: ; %UnifiedReturnBlock
23-
; GCN-NEXT: s_endpgm
24-
; GCN-NEXT: .Lfunc_end
2513
define amdgpu_ps void @ham(float %arg, float %arg1) #0 {
14+
; GCN-NO-FLAT-LABEL: ham:
15+
; GCN-NO-FLAT: ; %bb.0: ; %bb
16+
; GCN-NO-FLAT-NEXT: v_cmp_lt_f32_e32 vcc, 0, v0
17+
; GCN-NO-FLAT-NEXT: v_cmp_lt_f32_e64 s[0:1], 0, v1
18+
; GCN-NO-FLAT-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
19+
; GCN-NO-FLAT-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
20+
; GCN-NO-FLAT-NEXT: s_cbranch_execz .LBB0_2
21+
; GCN-NO-FLAT-NEXT: ; %bb.1: ; %bb4
22+
; GCN-NO-FLAT-NEXT: v_mov_b32_e32 v0, 4
23+
; GCN-NO-FLAT-NEXT: s_mov_b32 m0, -1
24+
; GCN-NO-FLAT-NEXT: ds_write_b32 v0, v0
25+
; GCN-NO-FLAT-NEXT: ; divergent unreachable
26+
; GCN-NO-FLAT-NEXT: .LBB0_2: ; %UnifiedReturnBlock
27+
; GCN-NO-FLAT-NEXT: s_endpgm
28+
;
29+
; GCN-FLAT-LABEL: ham:
30+
; GCN-FLAT: ; %bb.0: ; %bb
31+
; GCN-FLAT-NEXT: v_cmp_lt_f32_e32 vcc, 0, v0
32+
; GCN-FLAT-NEXT: v_cmp_lt_f32_e64 s[0:1], 0, v1
33+
; GCN-FLAT-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
34+
; GCN-FLAT-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
35+
; GCN-FLAT-NEXT: ; %bb.1: ; %bb4
36+
; GCN-FLAT-NEXT: v_mov_b32_e32 v0, 4
37+
; GCN-FLAT-NEXT: s_mov_b32 m0, -1
38+
; GCN-FLAT-NEXT: ds_write_b32 v0, v0
39+
; GCN-FLAT-NEXT: ; divergent unreachable
40+
; GCN-FLAT-NEXT: ; %bb.2: ; %UnifiedReturnBlock
41+
; GCN-FLAT-NEXT: s_endpgm
2642
bb:
2743
%tmp = fcmp ogt float %arg, 0.000000e+00
2844
%tmp2 = fcmp ogt float %arg1, 0.000000e+00

llvm/test/CodeGen/AMDGPU/else.ll

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,6 @@ end:
3030
; CHECK-NEXT: s_and_b64 exec, exec, [[INIT_EXEC]]
3131
; CHECK-NEXT: s_and_b64 [[AND_INIT:s\[[0-9]+:[0-9]+\]]], exec, [[DST]]
3232
; CHECK-NEXT: s_xor_b64 exec, exec, [[AND_INIT]]
33-
; CHECK-NEXT: s_cbranch_execz
3433
define amdgpu_ps void @else_execfix_leave_wqm(i32 %z, float %v) #0 {
3534
main_body:
3635
%cc = icmp sgt i32 %z, 5

llvm/test/CodeGen/AMDGPU/fptoi.i128.ll

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1744,7 +1744,6 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) {
17441744
; GISEL-NEXT: ; implicit-def: $vgpr9
17451745
; GISEL-NEXT: .LBB6_4: ; %Flow
17461746
; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[16:17]
1747-
; GISEL-NEXT: s_cbranch_execz .LBB6_6
17481747
; GISEL-NEXT: ; %bb.5: ; %fp-to-i-if-then12
17491748
; GISEL-NEXT: v_sub_co_u32_e32 v3, vcc, 0x86, v5
17501749
; GISEL-NEXT: v_subrev_u32_e32 v2, 64, v3
@@ -1758,7 +1757,7 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) {
17581757
; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v1
17591758
; GISEL-NEXT: v_mul_i32_i24_e32 v0, v0, v9
17601759
; GISEL-NEXT: v_mov_b32_e32 v3, v2
1761-
; GISEL-NEXT: .LBB6_6: ; %Flow1
1760+
; GISEL-NEXT: ; %bb.6: ; %Flow1
17621761
; GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
17631762
; GISEL-NEXT: .LBB6_7: ; %Flow2
17641763
; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[14:15]
@@ -2095,7 +2094,6 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) {
20952094
; GISEL-NEXT: ; implicit-def: $vgpr9
20962095
; GISEL-NEXT: .LBB7_4: ; %Flow
20972096
; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[16:17]
2098-
; GISEL-NEXT: s_cbranch_execz .LBB7_6
20992097
; GISEL-NEXT: ; %bb.5: ; %fp-to-i-if-then12
21002098
; GISEL-NEXT: v_sub_co_u32_e32 v3, vcc, 0x86, v5
21012099
; GISEL-NEXT: v_subrev_u32_e32 v2, 64, v3
@@ -2109,7 +2107,7 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) {
21092107
; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v1
21102108
; GISEL-NEXT: v_mul_i32_i24_e32 v0, v0, v9
21112109
; GISEL-NEXT: v_mov_b32_e32 v3, v2
2112-
; GISEL-NEXT: .LBB7_6: ; %Flow1
2110+
; GISEL-NEXT: ; %bb.6: ; %Flow1
21132111
; GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
21142112
; GISEL-NEXT: .LBB7_7: ; %Flow2
21152113
; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[14:15]

llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -36,11 +36,12 @@ define amdgpu_ps void @i1_copy_from_loop(ptr addrspace(8) inreg %rsrc, i32 %tid)
3636
; SI-NEXT: v_cmp_le_f32_e32 vcc, 0, v1
3737
; SI-NEXT: s_mov_b64 s[8:9], -1
3838
; SI-NEXT: s_and_saveexec_b64 s[12:13], vcc
39+
; SI-NEXT: s_cbranch_execz .LBB0_6
3940
; SI-NEXT: ; %bb.5: ; %end.loop
4041
; SI-NEXT: ; in Loop: Header=BB0_3 Depth=1
4142
; SI-NEXT: s_add_i32 s14, s14, 1
4243
; SI-NEXT: s_xor_b64 s[8:9], exec, -1
43-
; SI-NEXT: ; %bb.6: ; %Flow1
44+
; SI-NEXT: .LBB0_6: ; %Flow1
4445
; SI-NEXT: ; in Loop: Header=BB0_3 Depth=1
4546
; SI-NEXT: s_or_b64 exec, exec, s[12:13]
4647
; SI-NEXT: s_branch .LBB0_2

0 commit comments

Comments
 (0)