Skip to content

Commit fc56bb2

Browse files
committed
[AMDGPU][SIPreEmitPeephole] mustRetainExeczBranch: use a cost model
s_cbranch_execz branches are removed when the transformation is profitable according to BranchProbability and the TargetSchedModel.
1 parent 3422bab commit fc56bb2

22 files changed

+303
-475
lines changed

llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp

Lines changed: 58 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@
1515
#include "GCNSubtarget.h"
1616
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
1717
#include "llvm/CodeGen/MachineFunctionPass.h"
18+
#include "llvm/CodeGen/TargetSchedule.h"
19+
#include "llvm/Support/BranchProbability.h"
20+
21+
#include <limits>
1822

1923
using namespace llvm;
2024

@@ -41,7 +45,8 @@ class SIPreEmitPeephole : public MachineFunctionPass {
4145
MachineBasicBlock *&TrueMBB,
4246
MachineBasicBlock *&FalseMBB,
4347
SmallVectorImpl<MachineOperand> &Cond);
44-
bool mustRetainExeczBranch(const MachineBasicBlock &From,
48+
bool mustRetainExeczBranch(const MachineBasicBlock &Head,
49+
const MachineBasicBlock &From,
4550
const MachineBasicBlock &To) const;
4651
bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
4752

@@ -305,10 +310,53 @@ bool SIPreEmitPeephole::getBlockDestinations(
305310
}
306311

307312
bool SIPreEmitPeephole::mustRetainExeczBranch(
308-
const MachineBasicBlock &From, const MachineBasicBlock &To) const {
313+
const MachineBasicBlock &Head, const MachineBasicBlock &From,
314+
const MachineBasicBlock &To) const {
315+
316+
auto FromIt = find(Head.successors(), &From);
317+
assert(FromIt != Head.succ_end());
318+
BranchProbability ExecNZProb = Head.getSuccProbability(FromIt);
319+
309320
unsigned NumInstr = 0;
310-
const MachineFunction *MF = From.getParent();
311321

322+
unsigned long ExecNZBranchCost = 0;
323+
unsigned long UnconditionalBranchCost = 0;
324+
unsigned long N = 0;
325+
unsigned long D = 0;
326+
unsigned long ThenCyclesCost = 0;
327+
328+
std::function<bool(const MachineInstr &)> IsProfitable =
329+
[&](const MachineInstr &MI) {
330+
++NumInstr;
331+
if (NumInstr >= SkipThreshold)
332+
return false;
333+
// These instructions are potentially expensive even if EXEC = 0.
334+
if (TII->isSMRD(MI) || TII->isVMEM(MI) || TII->isFLAT(MI) ||
335+
TII->isDS(MI) || TII->isWaitcnt(MI.getOpcode()))
336+
return false;
337+
return true;
338+
};
339+
340+
auto &SchedModel = TII->getSchedModel();
341+
if (SchedModel.hasInstrSchedModel() && !ExecNZProb.isUnknown()) {
342+
ExecNZBranchCost = SchedModel.computeInstrLatency(AMDGPU::S_CBRANCH_EXECZ);
343+
UnconditionalBranchCost = SchedModel.computeInstrLatency(AMDGPU::S_BRANCH);
344+
N = ExecNZProb.getNumerator();
345+
D = ExecNZProb.getDenominator();
346+
347+
IsProfitable = [&](const MachineInstr &MI) {
348+
ThenCyclesCost += SchedModel.computeInstrLatency(&MI, false);
349+
350+
// Consider `P = N/D` to be the probability of execnz being true
351+
// The transformation is profitable if always executing the 'then' block
352+
// is cheaper than executing sometimes 'then', s_branch and always
353+
// executing s_cbranch_execnz
354+
return (D - N) * ThenCyclesCost <=
355+
D * ExecNZBranchCost + (D - N) * UnconditionalBranchCost;
356+
};
357+
}
358+
359+
const MachineFunction *MF = From.getParent();
312360
for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
313361
MBBI != End && MBBI != ToI; ++MBBI) {
314362
const MachineBasicBlock &MBB = *MBBI;
@@ -326,13 +374,7 @@ bool SIPreEmitPeephole::mustRetainExeczBranch(
326374
if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
327375
return true;
328376

329-
// These instructions are potentially expensive even if EXEC = 0.
330-
if (TII->isSMRD(MI) || TII->isVMEM(MI) || TII->isFLAT(MI) ||
331-
TII->isDS(MI) || TII->isWaitcnt(MI.getOpcode()))
332-
return true;
333-
334-
++NumInstr;
335-
if (NumInstr >= SkipThreshold)
377+
if (!IsProfitable(MI))
336378
return true;
337379
}
338380
}
@@ -351,8 +393,11 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
351393
return false;
352394

353395
// Consider only the forward branches.
354-
if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) ||
355-
mustRetainExeczBranch(*FalseMBB, *TrueMBB))
396+
if (SrcMBB.getNumber() >= TrueMBB->getNumber())
397+
return false;
398+
399+
// Consider only when it is legal and profitable
400+
if (mustRetainExeczBranch(SrcMBB, *FalseMBB, *TrueMBB))
356401
return false;
357402

358403
LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
@@ -366,6 +411,7 @@ bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
366411
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
367412
TII = ST.getInstrInfo();
368413
TRI = &TII->getRegisterInfo();
414+
369415
bool Changed = false;
370416

371417
MF.RenumberBlocks();

llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1726,7 +1726,6 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr
17261726
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
17271727
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
17281728
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
1729-
; GFX90A-NEXT: s_cbranch_execz .LBB59_2
17301729
; GFX90A-NEXT: ; %bb.1:
17311730
; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24
17321731
; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
@@ -1736,7 +1735,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr
17361735
; GFX90A-NEXT: v_mov_b32_e32 v2, s2
17371736
; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
17381737
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1739-
; GFX90A-NEXT: .LBB59_2:
1738+
; GFX90A-NEXT: ; %bb.2:
17401739
; GFX90A-NEXT: s_endpgm
17411740
;
17421741
; GFX940-LABEL: local_atomic_fadd_f64_noret_pat:
@@ -1747,7 +1746,6 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr
17471746
; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
17481747
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
17491748
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
1750-
; GFX940-NEXT: s_cbranch_execz .LBB59_2
17511749
; GFX940-NEXT: ; %bb.1:
17521750
; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24
17531751
; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
@@ -1757,7 +1755,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr
17571755
; GFX940-NEXT: v_mov_b32_e32 v2, s2
17581756
; GFX940-NEXT: ds_add_f64 v2, v[0:1]
17591757
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
1760-
; GFX940-NEXT: .LBB59_2:
1758+
; GFX940-NEXT: ; %bb.2:
17611759
; GFX940-NEXT: s_endpgm
17621760
main_body:
17631761
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -1773,7 +1771,6 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3
17731771
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
17741772
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
17751773
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
1776-
; GFX90A-NEXT: s_cbranch_execz .LBB60_2
17771774
; GFX90A-NEXT: ; %bb.1:
17781775
; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24
17791776
; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
@@ -1783,7 +1780,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3
17831780
; GFX90A-NEXT: v_mov_b32_e32 v2, s2
17841781
; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
17851782
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1786-
; GFX90A-NEXT: .LBB60_2:
1783+
; GFX90A-NEXT: ; %bb.2:
17871784
; GFX90A-NEXT: s_endpgm
17881785
;
17891786
; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush:
@@ -1794,7 +1791,6 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3
17941791
; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
17951792
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
17961793
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
1797-
; GFX940-NEXT: s_cbranch_execz .LBB60_2
17981794
; GFX940-NEXT: ; %bb.1:
17991795
; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24
18001796
; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
@@ -1804,7 +1800,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3
18041800
; GFX940-NEXT: v_mov_b32_e32 v2, s2
18051801
; GFX940-NEXT: ds_add_f64 v2, v[0:1]
18061802
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
1807-
; GFX940-NEXT: .LBB60_2:
1803+
; GFX940-NEXT: ; %bb.2:
18081804
; GFX940-NEXT: s_endpgm
18091805
main_body:
18101806
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -1820,7 +1816,6 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
18201816
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
18211817
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
18221818
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
1823-
; GFX90A-NEXT: s_cbranch_execz .LBB61_2
18241819
; GFX90A-NEXT: ; %bb.1:
18251820
; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24
18261821
; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
@@ -1830,7 +1825,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
18301825
; GFX90A-NEXT: v_mov_b32_e32 v2, s2
18311826
; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
18321827
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1833-
; GFX90A-NEXT: .LBB61_2:
1828+
; GFX90A-NEXT: ; %bb.2:
18341829
; GFX90A-NEXT: s_endpgm
18351830
;
18361831
; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
@@ -1841,7 +1836,6 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
18411836
; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
18421837
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
18431838
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
1844-
; GFX940-NEXT: s_cbranch_execz .LBB61_2
18451839
; GFX940-NEXT: ; %bb.1:
18461840
; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24
18471841
; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
@@ -1851,7 +1845,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
18511845
; GFX940-NEXT: v_mov_b32_e32 v2, s2
18521846
; GFX940-NEXT: ds_add_f64 v2, v[0:1]
18531847
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
1854-
; GFX940-NEXT: .LBB61_2:
1848+
; GFX940-NEXT: ; %bb.2:
18551849
; GFX940-NEXT: s_endpgm
18561850
main_body:
18571851
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0

llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -526,21 +526,19 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1)
526526
; GFX10-NEXT: v_cmp_ge_u64_e32 vcc_lo, 0, v[2:3]
527527
; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo
528528
; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0
529-
; GFX10-NEXT: s_cbranch_execz .LBB10_2
530529
; GFX10-NEXT: ; %bb.1: ; %else
531530
; GFX10-NEXT: s_waitcnt vmcnt(0)
532531
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, v2, v4, 0
533532
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s1, v2, v5, v[1:2]
534533
; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3
535534
; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5
536-
; GFX10-NEXT: .LBB10_2: ; %Flow
535+
; GFX10-NEXT: ; %bb.2: ; %Flow
537536
; GFX10-NEXT: s_andn2_saveexec_b32 s0, s0
538-
; GFX10-NEXT: s_cbranch_execz .LBB10_4
539537
; GFX10-NEXT: ; %bb.3: ; %if
540538
; GFX10-NEXT: s_waitcnt vmcnt(0)
541539
; GFX10-NEXT: v_mul_lo_u32 v1, v2, v5
542540
; GFX10-NEXT: v_mov_b32_e32 v0, 0
543-
; GFX10-NEXT: .LBB10_4: ; %endif
541+
; GFX10-NEXT: ; %bb.4: ; %endif
544542
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0
545543
; GFX10-NEXT: v_mov_b32_e32 v2, 0
546544
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
@@ -563,7 +561,6 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1)
563561
; GFX11-NEXT: s_waitcnt vmcnt(1)
564562
; GFX11-NEXT: v_cmpx_ge_u64_e32 0, v[2:3]
565563
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
566-
; GFX11-NEXT: s_cbranch_execz .LBB10_2
567564
; GFX11-NEXT: ; %bb.1: ; %else
568565
; GFX11-NEXT: s_waitcnt vmcnt(0)
569566
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v4, 0
@@ -572,14 +569,13 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1)
572569
; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
573570
; GFX11-NEXT: v_mov_b32_e32 v1, v3
574571
; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3
575-
; GFX11-NEXT: .LBB10_2: ; %Flow
572+
; GFX11-NEXT: ; %bb.2: ; %Flow
576573
; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
577-
; GFX11-NEXT: s_cbranch_execz .LBB10_4
578574
; GFX11-NEXT: ; %bb.3: ; %if
579575
; GFX11-NEXT: s_waitcnt vmcnt(0)
580576
; GFX11-NEXT: v_mul_lo_u32 v1, v2, v5
581577
; GFX11-NEXT: v_mov_b32_e32 v0, 0
582-
; GFX11-NEXT: .LBB10_4: ; %endif
578+
; GFX11-NEXT: ; %bb.4: ; %endif
583579
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
584580
; GFX11-NEXT: v_mov_b32_e32 v2, 0
585581
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]

llvm/test/CodeGen/AMDGPU/amdgpu-demote-scc-branches.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,6 @@ define void @divergent_cmp_profitable(i32 noundef inreg %value, ptr addrspace(8)
209209
; GFX1010-NEXT: v_and_b32_e32 v0, 0x3ff, v31
210210
; GFX1010-NEXT: v_cmp_gt_i32_e32 vcc_lo, s21, v0
211211
; GFX1010-NEXT: s_and_saveexec_b32 s4, vcc_lo
212-
; GFX1010-NEXT: s_cbranch_execz .LBB5_2
213212
; GFX1010-NEXT: ; %bb.1: ; %if.then
214213
; GFX1010-NEXT: v_mov_b32_e32 v0, s6
215214
; GFX1010-NEXT: v_mov_b32_e32 v1, s19
@@ -218,7 +217,7 @@ define void @divergent_cmp_profitable(i32 noundef inreg %value, ptr addrspace(8)
218217
; GFX1010-NEXT: s_mov_b32 s9, s16
219218
; GFX1010-NEXT: s_mov_b32 s8, s7
220219
; GFX1010-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
221-
; GFX1010-NEXT: .LBB5_2: ; %if.end
220+
; GFX1010-NEXT: ; %bb.2: ; %if.end
222221
; GFX1010-NEXT: s_waitcnt_depctr 0xffe3
223222
; GFX1010-NEXT: s_or_b32 exec_lo, exec_lo, s4
224223
; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -230,7 +229,6 @@ define void @divergent_cmp_profitable(i32 noundef inreg %value, ptr addrspace(8)
230229
; GFX1030-NEXT: v_and_b32_e32 v0, 0x3ff, v31
231230
; GFX1030-NEXT: s_mov_b32 s4, exec_lo
232231
; GFX1030-NEXT: v_cmpx_gt_i32_e64 s21, v0
233-
; GFX1030-NEXT: s_cbranch_execz .LBB5_2
234232
; GFX1030-NEXT: ; %bb.1: ; %if.then
235233
; GFX1030-NEXT: v_mov_b32_e32 v0, s6
236234
; GFX1030-NEXT: v_mov_b32_e32 v1, s19
@@ -239,7 +237,7 @@ define void @divergent_cmp_profitable(i32 noundef inreg %value, ptr addrspace(8)
239237
; GFX1030-NEXT: s_mov_b32 s9, s16
240238
; GFX1030-NEXT: s_mov_b32 s8, s7
241239
; GFX1030-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
242-
; GFX1030-NEXT: .LBB5_2: ; %if.end
240+
; GFX1030-NEXT: ; %bb.2: ; %if.end
243241
; GFX1030-NEXT: s_or_b32 exec_lo, exec_lo, s4
244242
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
245243
; GFX1030-NEXT: s_setpc_b64 s[30:31]

0 commit comments

Comments
 (0)