Skip to content

Commit 45f3bf8

Browse files
committed
[AMDGPU][SIPreEmitPeephole] mustRetainExeczBranch: use BranchProbability and TargetSchedModel
Remove s_cbranch_execz branches if the transformation is profitable according to BranchProbability and TargetSchedModel.
1 parent eb9f369 commit 45f3bf8

21 files changed

+151
-226
lines changed

llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp

Lines changed: 65 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -15,19 +15,13 @@
1515
#include "GCNSubtarget.h"
1616
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
1717
#include "llvm/CodeGen/MachineFunctionPass.h"
18+
#include "llvm/CodeGen/TargetSchedule.h"
19+
#include "llvm/Support/BranchProbability.h"
1820

1921
using namespace llvm;
2022

2123
#define DEBUG_TYPE "si-pre-emit-peephole"
2224

23-
static unsigned SkipThreshold;
24-
25-
static cl::opt<unsigned, true> SkipThresholdFlag(
26-
"amdgpu-skip-threshold", cl::Hidden,
27-
cl::desc(
28-
"Number of instructions before jumping over divergent control flow"),
29-
cl::location(SkipThreshold), cl::init(12));
30-
3125
namespace {
3226

3327
class SIPreEmitPeephole : public MachineFunctionPass {
@@ -41,7 +35,8 @@ class SIPreEmitPeephole : public MachineFunctionPass {
4135
MachineBasicBlock *&TrueMBB,
4236
MachineBasicBlock *&FalseMBB,
4337
SmallVectorImpl<MachineOperand> &Cond);
44-
bool mustRetainExeczBranch(const MachineBasicBlock &From,
38+
bool mustRetainExeczBranch(const MachineBasicBlock &Head,
39+
const MachineBasicBlock &From,
4540
const MachineBasicBlock &To) const;
4641
bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
4742

@@ -304,11 +299,58 @@ bool SIPreEmitPeephole::getBlockDestinations(
304299
return true;
305300
}
306301

302+
namespace {
303+
/// Incremental cost model used to decide whether a conditional branch that
/// skips a region of code is worth keeping.
///
/// The model is seeded with the branch instruction and the successor that is
/// entered when the region is executed. Instructions of that region are then
/// fed one at a time to isProfitable(), which accumulates their latency and
/// reports whether dropping the branch is still considered profitable.
class BranchWeightCostModel {
304+
const SIInstrInfo &TII;
305+
const TargetSchedModel &SchedModel;
306+
// Probability of the edge from the branch's parent block to `Succ`.
BranchProbability BranchProb;
307+
// Latency of the branch instruction as reported by the scheduling model.
uint64_t BranchCost;
308+
// Running sum of the latencies of all instructions passed to isProfitable().
uint64_t ThenCyclesCost = 0;
309+
310+
public:
311+
/// \param TII    Instruction info; also supplies the scheduling model.
/// \param Branch The branch instruction being evaluated; its parent block is
///               used as the head of the region.
/// \param Succ   Successor of Branch's parent block whose edge probability is
///               used. Must be one of the parent's successors, and that edge
///               must have a known probability (both are asserted).
BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch,
312+
const MachineBasicBlock &Succ)
313+
: TII(TII), SchedModel(TII.getSchedModel()) {
314+
// Latency queries below require a usable scheduling model.
assert(SchedModel.hasInstrSchedModelOrItineraries());
315+
316+
const MachineBasicBlock &Head = *Branch.getParent();
317+
const auto *FromIt = find(Head.successors(), &Succ);
318+
assert(FromIt != Head.succ_end());
319+
320+
BranchProb = Head.getSuccProbability(FromIt);
321+
assert(!BranchProb.isUnknown());
322+
BranchCost = SchedModel.computeInstrLatency(&Branch, false);
323+
}
324+
325+
/// Accounts for \p MI and reports whether removing the branch is still
/// profitable. This call is stateful: the latency of every \p MI seen so far
/// is included in the comparison.
///
/// \returns false immediately for wait-count instructions (their latency is
///          not accumulated); otherwise true iff the inequality derived below
///          holds for the accumulated cost.
bool isProfitable(const MachineInstr &MI) {
326+
if (TII.isWaitcnt(MI.getOpcode()))
327+
return false;
328+
329+
ThenCyclesCost += SchedModel.computeInstrLatency(&MI, false);
330+
331+
// Consider `P = N/D` to be BranchProb, the probability of the Head -> Succ
// edge, i.e. of the 'then' block being entered.
// NOTE(review): this comment previously read "the probability of execz being
// true"; the formula below only works if P is the probability of running
// 'then' -- presumably execz being false; confirm against the caller.
332+
// The transformation is profitable if always executing the 'then' block
333+
// is cheaper than executing sometimes 'then' and always
334+
// executing s_cbranch_execz:
335+
// * ThenCost <= P*ThenCost + BranchCost
336+
// * (1-P) * ThenCost <= BranchCost
337+
// * (D-N)/D * ThenCost <= BranchCost
// Both sides are multiplied by D below so the test stays in integers.
338+
uint64_t Numerator = BranchProb.getNumerator();
339+
uint64_t Denominator = BranchProb.getDenominator();
340+
return (Denominator - Numerator) * ThenCyclesCost <=
341+
Denominator * BranchCost;
342+
}
343+
};
344+
307345
bool SIPreEmitPeephole::mustRetainExeczBranch(
308-
const MachineBasicBlock &From, const MachineBasicBlock &To) const {
309-
unsigned NumInstr = 0;
310-
const MachineFunction *MF = From.getParent();
346+
const MachineBasicBlock &Head, const MachineBasicBlock &From,
347+
const MachineBasicBlock &To) const {
348+
349+
assert(is_contained(Head.successors(), &From));
350+
351+
BranchWeightCostModel CostModel{*TII, *Head.getFirstTerminator(), From};
311352

353+
const MachineFunction *MF = From.getParent();
312354
for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
313355
MBBI != End && MBBI != ToI; ++MBBI) {
314356
const MachineBasicBlock &MBB = *MBBI;
@@ -326,23 +368,22 @@ bool SIPreEmitPeephole::mustRetainExeczBranch(
326368
if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
327369
return true;
328370

329-
// These instructions are potentially expensive even if EXEC = 0.
330-
if (TII->isSMRD(MI) || TII->isVMEM(MI) || TII->isFLAT(MI) ||
331-
TII->isDS(MI) || TII->isWaitcnt(MI.getOpcode()))
332-
return true;
333-
334-
++NumInstr;
335-
if (NumInstr >= SkipThreshold)
371+
if (!CostModel.isProfitable(MI))
336372
return true;
337373
}
338374
}
339375

340376
return false;
341377
}
378+
} // namespace
342379

343380
// Returns true if the skip branch instruction is removed.
344381
bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
345382
MachineBasicBlock &SrcMBB) {
383+
384+
if (!TII->getSchedModel().hasInstrSchedModelOrItineraries())
385+
return false;
386+
346387
MachineBasicBlock *TrueMBB = nullptr;
347388
MachineBasicBlock *FalseMBB = nullptr;
348389
SmallVector<MachineOperand, 1> Cond;
@@ -351,8 +392,11 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
351392
return false;
352393

353394
// Consider only the forward branches.
354-
if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) ||
355-
mustRetainExeczBranch(*FalseMBB, *TrueMBB))
395+
if (SrcMBB.getNumber() >= TrueMBB->getNumber())
396+
return false;
397+
398+
// Consider only when it is legal and profitable
399+
if (mustRetainExeczBranch(SrcMBB, *FalseMBB, *TrueMBB))
356400
return false;
357401

358402
LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);

llvm/test/CodeGen/AMDGPU/amdgpu-demote-scc-branches.ll

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -292,7 +292,6 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8)
292292
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
293293
; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 0, v0
294294
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
295-
; GFX9-NEXT: s_cbranch_execz .LBB5_2
296295
; GFX9-NEXT: ; %bb.1: ; %if.then
297296
; GFX9-NEXT: s_mov_b32 s11, s18
298297
; GFX9-NEXT: s_mov_b32 s10, s17
@@ -301,7 +300,7 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8)
301300
; GFX9-NEXT: v_mov_b32_e32 v0, s6
302301
; GFX9-NEXT: v_mov_b32_e32 v1, s19
303302
; GFX9-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
304-
; GFX9-NEXT: .LBB5_2: ; %if.end
303+
; GFX9-NEXT: ; %bb.2: ; %if.end
305304
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
306305
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
307306
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -311,7 +310,6 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8)
311310
; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
312311
; GFX1010-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0, v0
313312
; GFX1010-NEXT: s_and_saveexec_b32 s4, vcc_lo
314-
; GFX1010-NEXT: s_cbranch_execz .LBB5_2
315313
; GFX1010-NEXT: ; %bb.1: ; %if.then
316314
; GFX1010-NEXT: v_mov_b32_e32 v0, s6
317315
; GFX1010-NEXT: v_mov_b32_e32 v1, s19
@@ -320,7 +318,7 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8)
320318
; GFX1010-NEXT: s_mov_b32 s9, s16
321319
; GFX1010-NEXT: s_mov_b32 s8, s7
322320
; GFX1010-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
323-
; GFX1010-NEXT: .LBB5_2: ; %if.end
321+
; GFX1010-NEXT: ; %bb.2: ; %if.end
324322
; GFX1010-NEXT: s_waitcnt_depctr 0xffe3
325323
; GFX1010-NEXT: s_or_b32 exec_lo, exec_lo, s4
326324
; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -331,7 +329,6 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8)
331329
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
332330
; GFX1030-NEXT: s_mov_b32 s4, exec_lo
333331
; GFX1030-NEXT: v_cmpx_lt_i32_e32 0, v0
334-
; GFX1030-NEXT: s_cbranch_execz .LBB5_2
335332
; GFX1030-NEXT: ; %bb.1: ; %if.then
336333
; GFX1030-NEXT: v_mov_b32_e32 v0, s6
337334
; GFX1030-NEXT: v_mov_b32_e32 v1, s19
@@ -340,7 +337,7 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8)
340337
; GFX1030-NEXT: s_mov_b32 s9, s16
341338
; GFX1030-NEXT: s_mov_b32 s8, s7
342339
; GFX1030-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
343-
; GFX1030-NEXT: .LBB5_2: ; %if.end
340+
; GFX1030-NEXT: ; %bb.2: ; %if.end
344341
; GFX1030-NEXT: s_or_b32 exec_lo, exec_lo, s4
345342
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
346343
; GFX1030-NEXT: s_setpc_b64 s[30:31]

llvm/test/CodeGen/AMDGPU/branch-condition-and.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,12 @@ define amdgpu_ps void @ham(float %arg, float %arg1) #0 {
1717
; GCN-NEXT: v_cmp_lt_f32_e64 s[0:1], 0, v1
1818
; GCN-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
1919
; GCN-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
20-
; GCN-NEXT: s_cbranch_execz .LBB0_2
2120
; GCN-NEXT: ; %bb.1: ; %bb4
2221
; GCN-NEXT: v_mov_b32_e32 v0, 4
2322
; GCN-NEXT: s_mov_b32 m0, -1
2423
; GCN-NEXT: ds_write_b32 v0, v0
2524
; GCN-NEXT: ; divergent unreachable
26-
; GCN-NEXT: .LBB0_2: ; %UnifiedReturnBlock
25+
; GCN-NEXT: ; %bb.2: ; %UnifiedReturnBlock
2726
; GCN-NEXT: s_endpgm
2827
bb:
2928
%tmp = fcmp ogt float %arg, 0.000000e+00

llvm/test/CodeGen/AMDGPU/else.ll

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ end:
3030
; CHECK-NEXT: s_and_b64 exec, exec, [[INIT_EXEC]]
3131
; CHECK-NEXT: s_and_b64 [[AND_INIT:s\[[0-9]+:[0-9]+\]]], exec, [[DST]]
3232
; CHECK-NEXT: s_xor_b64 exec, exec, [[AND_INIT]]
33-
; CHECK-NEXT: s_cbranch_execz
3433
define amdgpu_ps void @else_execfix_leave_wqm(i32 %z, float %v) #0 {
3534
main_body:
3635
%cc = icmp sgt i32 %z, 5

llvm/test/CodeGen/AMDGPU/fptoi.i128.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1744,7 +1744,6 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) {
17441744
; GISEL-NEXT: ; implicit-def: $vgpr9
17451745
; GISEL-NEXT: .LBB6_4: ; %Flow
17461746
; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[16:17]
1747-
; GISEL-NEXT: s_cbranch_execz .LBB6_6
17481747
; GISEL-NEXT: ; %bb.5: ; %fp-to-i-if-then12
17491748
; GISEL-NEXT: v_sub_co_u32_e32 v3, vcc, 0x86, v5
17501749
; GISEL-NEXT: v_subrev_u32_e32 v2, 64, v3
@@ -1758,7 +1757,7 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) {
17581757
; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v1
17591758
; GISEL-NEXT: v_mul_i32_i24_e32 v0, v0, v9
17601759
; GISEL-NEXT: v_mov_b32_e32 v3, v2
1761-
; GISEL-NEXT: .LBB6_6: ; %Flow1
1760+
; GISEL-NEXT: ; %bb.6: ; %Flow1
17621761
; GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
17631762
; GISEL-NEXT: .LBB6_7: ; %Flow2
17641763
; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[14:15]
@@ -2095,7 +2094,6 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) {
20952094
; GISEL-NEXT: ; implicit-def: $vgpr9
20962095
; GISEL-NEXT: .LBB7_4: ; %Flow
20972096
; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[16:17]
2098-
; GISEL-NEXT: s_cbranch_execz .LBB7_6
20992097
; GISEL-NEXT: ; %bb.5: ; %fp-to-i-if-then12
21002098
; GISEL-NEXT: v_sub_co_u32_e32 v3, vcc, 0x86, v5
21012099
; GISEL-NEXT: v_subrev_u32_e32 v2, 64, v3
@@ -2109,7 +2107,7 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) {
21092107
; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v1
21102108
; GISEL-NEXT: v_mul_i32_i24_e32 v0, v0, v9
21112109
; GISEL-NEXT: v_mov_b32_e32 v3, v2
2112-
; GISEL-NEXT: .LBB7_6: ; %Flow1
2110+
; GISEL-NEXT: ; %bb.6: ; %Flow1
21132111
; GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
21142112
; GISEL-NEXT: .LBB7_7: ; %Flow2
21152113
; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[14:15]

llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,12 @@ define amdgpu_ps void @i1_copy_from_loop(ptr addrspace(8) inreg %rsrc, i32 %tid)
3636
; SI-NEXT: v_cmp_le_f32_e32 vcc, 0, v1
3737
; SI-NEXT: s_mov_b64 s[8:9], -1
3838
; SI-NEXT: s_and_saveexec_b64 s[12:13], vcc
39+
; SI-NEXT: s_cbranch_execz .LBB0_6
3940
; SI-NEXT: ; %bb.5: ; %end.loop
4041
; SI-NEXT: ; in Loop: Header=BB0_3 Depth=1
4142
; SI-NEXT: s_add_i32 s14, s14, 1
4243
; SI-NEXT: s_xor_b64 s[8:9], exec, -1
43-
; SI-NEXT: ; %bb.6: ; %Flow1
44+
; SI-NEXT: .LBB0_6: ; %Flow1
4445
; SI-NEXT: ; in Loop: Header=BB0_3 Depth=1
4546
; SI-NEXT: s_or_b64 exec, exec, s[12:13]
4647
; SI-NEXT: s_branch .LBB0_2

llvm/test/CodeGen/AMDGPU/insert-handle-flat-vmem-ds.mir

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,7 @@ name: skip_execz_flat
77
body: |
88
; CHECK-LABEL: name: skip_execz_flat
99
; CHECK: bb.0:
10-
; CHECK-NEXT: successors: %bb.1(0x7fffffff), %bb.2(0x00000001)
11-
; CHECK-NEXT: {{ $}}
12-
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
10+
; CHECK-NEXT: successors: %bb.1(0x7fffffff)
1311
; CHECK-NEXT: {{ $}}
1412
; CHECK-NEXT: bb.1:
1513
; CHECK-NEXT: successors: %bb.2(0x80000000)
@@ -38,9 +36,7 @@ name: skip_execz_mubuf
3836
body: |
3937
; CHECK-LABEL: name: skip_execz_mubuf
4038
; CHECK: bb.0:
41-
; CHECK-NEXT: successors: %bb.1(0x7fffffff), %bb.2(0x00000001)
42-
; CHECK-NEXT: {{ $}}
43-
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
39+
; CHECK-NEXT: successors: %bb.1(0x7fffffff)
4440
; CHECK-NEXT: {{ $}}
4541
; CHECK-NEXT: bb.1:
4642
; CHECK-NEXT: successors: %bb.2(0x80000000)
@@ -69,9 +65,7 @@ name: skip_execz_ds
6965
body: |
7066
; CHECK-LABEL: name: skip_execz_ds
7167
; CHECK: bb.0:
72-
; CHECK-NEXT: successors: %bb.1(0x7fffffff), %bb.2(0x00000001)
73-
; CHECK-NEXT: {{ $}}
74-
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
68+
; CHECK-NEXT: successors: %bb.1(0x7fffffff)
7569
; CHECK-NEXT: {{ $}}
7670
; CHECK-NEXT: bb.1:
7771
; CHECK-NEXT: successors: %bb.2(0x80000000)

llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem-ds.mir

Lines changed: 0 additions & 95 deletions
This file was deleted.

llvm/test/CodeGen/AMDGPU/insert-skips-gfx10.mir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
2-
# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -run-pass si-pre-emit-peephole -amdgpu-skip-threshold=10 -verify-machineinstrs %s -o - | FileCheck %s
2+
# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -run-pass si-pre-emit-peephole -verify-machineinstrs %s -o - | FileCheck %s
33

44
---
55
name: skip_waitcnt_vscnt

0 commit comments

Comments
 (0)