Skip to content

Commit dccd9fe

Browse files
committed
[AMDGPU][SIPreEmitPeephole] mustRetainExeczBranch: estimate ThenBlock cost using MachineTraceInfo
1 parent 72e6e20 commit dccd9fe

20 files changed

+185
-258
lines changed

llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp

Lines changed: 57 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,10 @@
1515
#include "GCNSubtarget.h"
1616
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
1717
#include "llvm/CodeGen/MachineFunctionPass.h"
18+
#include "llvm/CodeGen/MachineInstr.h"
19+
#include "llvm/CodeGen/MachineTraceMetrics.h"
1820
#include "llvm/CodeGen/TargetSchedule.h"
21+
#include "llvm/InitializePasses.h"
1922
#include "llvm/Support/BranchProbability.h"
2023

2124
using namespace llvm;
@@ -29,6 +32,13 @@ class SIPreEmitPeephole : public MachineFunctionPass {
2932
const SIInstrInfo *TII = nullptr;
3033
const SIRegisterInfo *TRI = nullptr;
3134

35+
// Trace metrics analysis result, used to estimate the number of cycles it
36+
// takes to execute a block. For simplicity, it is initialized with the
37+
// TS_Local strategy so that each trace is a single block; getCriticalPath and
38+
// getResourceDepth then give the results for a single block (instead of for a
39+
// whole trace).
40+
MachineTraceMetrics::Ensemble *Traces;
41+
3242
bool optimizeVccBranch(MachineInstr &MI) const;
3343
bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
3444
bool getBlockDestinations(MachineBasicBlock &SrcMBB,
@@ -37,9 +47,14 @@ class SIPreEmitPeephole : public MachineFunctionPass {
3747
SmallVectorImpl<MachineOperand> &Cond);
3848
bool mustRetainExeczBranch(const MachineInstr &Branch,
3949
const MachineBasicBlock &From,
40-
const MachineBasicBlock &To) const;
50+
const MachineBasicBlock &To);
4151
bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
4252

53+
void getAnalysisUsage(AnalysisUsage &AU) const override {
54+
AU.addRequired<MachineTraceMetrics>();
55+
MachineFunctionPass::getAnalysisUsage(AU);
56+
}
57+
4358
public:
4459
static char ID;
4560

@@ -52,8 +67,11 @@ class SIPreEmitPeephole : public MachineFunctionPass {
5267

5368
} // End anonymous namespace.
5469

55-
INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE,
56-
"SI peephole optimizations", false, false)
70+
INITIALIZE_PASS_BEGIN(SIPreEmitPeephole, DEBUG_TYPE,
71+
"SI peephole optimizations", false, false)
72+
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
73+
INITIALIZE_PASS_END(SIPreEmitPeephole, DEBUG_TYPE, "SI peephole optimizations",
74+
false, false)
5775

5876
char SIPreEmitPeephole::ID = 0;
5977

@@ -299,60 +317,23 @@ bool SIPreEmitPeephole::getBlockDestinations(
299317
return true;
300318
}
301319

302-
namespace {
303-
class BranchWeightCostModel {
304-
const SIInstrInfo &TII;
305-
const TargetSchedModel &SchedModel;
306-
BranchProbability BranchProb;
307-
static constexpr uint64_t BranchNotTakenCost = 1;
308-
uint64_t BranchTakenCost;
309-
uint64_t ThenCyclesCost = 0;
310-
311-
public:
312-
BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch,
313-
const MachineBasicBlock &Succ)
314-
: TII(TII), SchedModel(TII.getSchedModel()) {
315-
const MachineBasicBlock &Head = *Branch.getParent();
316-
const auto *FromIt = find(Head.successors(), &Succ);
317-
assert(FromIt != Head.succ_end());
318-
319-
BranchProb = Head.getSuccProbability(FromIt);
320-
if (BranchProb.isUnknown())
321-
BranchProb = BranchProbability::getZero();
322-
BranchTakenCost = SchedModel.computeInstrLatency(&Branch, false);
323-
}
324-
325-
bool isProfitable(const MachineInstr &MI) {
326-
if (TII.isWaitcnt(MI.getOpcode()))
327-
return false;
328-
329-
ThenCyclesCost += SchedModel.computeInstrLatency(&MI, false);
330-
331-
// Consider `P = N/D` to be the probability of execz being false (skipping
332-
// the then-block) The transformation is profitable if always executing the
333-
// 'then' block is cheaper than executing sometimes 'then' and always
334-
// executing s_cbranch_execz:
335-
// * ThenCost <= P*ThenCost + (1-P)*BranchTakenCost + P*BranchNotTakenCost
336-
// * (1-P) * ThenCost <= (1-P)*BranchTakenCost + P*BranchNotTakenCost
337-
// * (D-N)/D * ThenCost <= (D-N)/D * BranchTakenCost + N/D *
338-
// BranchNotTakenCost
339-
uint64_t Numerator = BranchProb.getNumerator();
340-
uint64_t Denominator = BranchProb.getDenominator();
341-
return (Denominator - Numerator) * ThenCyclesCost <=
342-
((Denominator - Numerator) * BranchTakenCost +
343-
Numerator * BranchNotTakenCost);
344-
}
345-
};
346-
347-
bool SIPreEmitPeephole::mustRetainExeczBranch(
348-
const MachineInstr &Branch, const MachineBasicBlock &From,
349-
const MachineBasicBlock &To) const {
320+
bool SIPreEmitPeephole::mustRetainExeczBranch(const MachineInstr &Branch,
321+
const MachineBasicBlock &From,
322+
const MachineBasicBlock &To) {
350323

351324
const MachineBasicBlock &Head = *Branch.getParent();
352-
assert(is_contained(Head.successors(), &From));
325+
const auto *FromIt = find(Head.successors(), &From);
326+
assert(FromIt != Head.succ_end());
327+
328+
auto BranchProb = Head.getSuccProbability(FromIt);
329+
if (BranchProb.isUnknown())
330+
return false;
353331

354-
BranchWeightCostModel CostModel{*TII, Branch, From};
332+
uint64_t BranchTakenCost =
333+
TII->getSchedModel().computeInstrLatency(&Branch, false);
334+
constexpr uint64_t BranchNotTakenCost = 1;
355335

336+
unsigned ThenCyclesCost = 0;
356337
const MachineFunction *MF = From.getParent();
357338
for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
358339
MBBI != End && MBBI != ToI; ++MBBI) {
@@ -371,14 +352,33 @@ bool SIPreEmitPeephole::mustRetainExeczBranch(
371352
if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
372353
return true;
373354

374-
if (!CostModel.isProfitable(MI))
355+
if (TII->isWaitcnt(MI.getOpcode()))
375356
return true;
376357
}
358+
359+
MachineTraceMetrics::Trace Trace = Traces->getTrace(&From);
360+
ThenCyclesCost +=
361+
std::max(Trace.getCriticalPath(), Trace.getResourceDepth(true));
362+
363+
// Consider `P = N/D` to be the probability of execz being false (entering
364+
// the then-block). The transformation is profitable if always executing the
365+
// 'then' block is cheaper than executing sometimes 'then' and always
366+
// executing s_cbranch_execz:
367+
// * ThenCost <= P*ThenCost + (1-P)*BranchTakenCost + P*BranchNotTakenCost
368+
// * (1-P) * ThenCost <= (1-P)*BranchTakenCost + P*BranchNotTakenCost
369+
// * (D-N)/D * ThenCost <= (D-N)/D * BranchTakenCost + N/D *
370+
// BranchNotTakenCost
371+
uint64_t Numerator = BranchProb.getNumerator();
372+
uint64_t Denominator = BranchProb.getDenominator();
373+
bool IsProfitable = (Denominator - Numerator) * ThenCyclesCost <=
374+
((Denominator - Numerator) * BranchTakenCost +
375+
Numerator * BranchNotTakenCost);
376+
if (!IsProfitable)
377+
return true;
377378
}
378379

379380
return false;
380381
}
381-
} // namespace
382382

383383
// Returns true if the skip branch instruction is removed.
384384
bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
@@ -413,6 +413,8 @@ bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
413413
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
414414
TII = ST.getInstrInfo();
415415
TRI = &TII->getRegisterInfo();
416+
Traces = getAnalysis<MachineTraceMetrics>().getEnsemble(
417+
llvm::MachineTraceStrategy::TS_Local);
416418
bool Changed = false;
417419

418420
MF.RenumberBlocks();

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -249,11 +249,10 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts,
249249
; GFX10-NEXT: .LBB3_6: ; %Flow1
250250
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
251251
; GFX10-NEXT: s_and_saveexec_b32 s4, s6
252-
; GFX10-NEXT: s_cbranch_execz .LBB3_8
253252
; GFX10-NEXT: ; %bb.7: ; %block.after.loop
254253
; GFX10-NEXT: v_mov_b32_e32 v0, 5
255254
; GFX10-NEXT: flat_store_dword v[3:4], v0
256-
; GFX10-NEXT: .LBB3_8: ; %exit
255+
; GFX10-NEXT: ; %bb.8: ; %exit
257256
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
258257
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
259258
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -315,15 +314,14 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
315314
; GFX10-NEXT: v_mov_b32_e32 v4, v5
316315
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
317316
; GFX10-NEXT: s_and_saveexec_b32 s7, vcc_lo
318-
; GFX10-NEXT: s_cbranch_execz .LBB4_4
319317
; GFX10-NEXT: ; %bb.3: ; %if.block.0
320318
; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1
321319
; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v4
322320
; GFX10-NEXT: v_lshlrev_b64 v[8:9], 2, v[4:5]
323321
; GFX10-NEXT: v_add_co_u32 v8, s4, v2, v8
324322
; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v3, v9, s4
325323
; GFX10-NEXT: global_store_dword v[8:9], v4, off
326-
; GFX10-NEXT: .LBB4_4: ; %loop.break.block
324+
; GFX10-NEXT: ; %bb.4: ; %loop.break.block
327325
; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1
328326
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
329327
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7
@@ -342,10 +340,9 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
342340
; GFX10-NEXT: .LBB4_6: ; %cond.block.1
343341
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
344342
; GFX10-NEXT: s_and_saveexec_b32 s4, s6
345-
; GFX10-NEXT: s_cbranch_execz .LBB4_8
346343
; GFX10-NEXT: ; %bb.7: ; %if.block.1
347344
; GFX10-NEXT: global_store_dword v[6:7], v4, off
348-
; GFX10-NEXT: .LBB4_8: ; %exit
345+
; GFX10-NEXT: ; %bb.8: ; %exit
349346
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
350347
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
351348
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -536,11 +533,10 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a
536533
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0
537534
; GFX10-NEXT: s_and_saveexec_b32 s0, s1
538535
; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0
539-
; GFX10-NEXT: s_cbranch_execz .LBB6_6
540536
; GFX10-NEXT: ; %bb.5: ; %break.body
541537
; GFX10-NEXT: v_mov_b32_e32 v0, 10
542538
; GFX10-NEXT: global_store_dword v[4:5], v0, off
543-
; GFX10-NEXT: .LBB6_6: ; %exit
539+
; GFX10-NEXT: ; %bb.6: ; %exit
544540
; GFX10-NEXT: s_endpgm
545541
entry:
546542
br label %A

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -437,11 +437,10 @@ define amdgpu_cs void @loop_with_div_break_with_body(ptr addrspace(1) %x, ptr ad
437437
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0
438438
; GFX10-NEXT: s_and_saveexec_b32 s0, s1
439439
; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0
440-
; GFX10-NEXT: s_cbranch_execz .LBB5_6
441440
; GFX10-NEXT: ; %bb.5: ; %break.body
442441
; GFX10-NEXT: v_mov_b32_e32 v0, 10
443442
; GFX10-NEXT: global_store_dword v[4:5], v0, off
444-
; GFX10-NEXT: .LBB5_6: ; %exit
443+
; GFX10-NEXT: ; %bb.6: ; %exit
445444
; GFX10-NEXT: s_endpgm
446445
entry:
447446
br label %A

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -152,12 +152,11 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, i32 %x.size, ptr ad
152152
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
153153
; GFX10-NEXT: s_and_saveexec_b32 s1, s0
154154
; GFX10-NEXT: s_xor_b32 s1, exec_lo, s1
155-
; GFX10-NEXT: s_cbranch_execz .LBB2_7
156155
; GFX10-NEXT: ; %bb.6: ; %break.body
157156
; GFX10-NEXT: v_mov_b32_e32 v0, 10
158157
; GFX10-NEXT: v_mov_b32_e32 v1, 0
159158
; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
160-
; GFX10-NEXT: .LBB2_7: ; %exit
159+
; GFX10-NEXT: ; %bb.7: ; %exit
161160
; GFX10-NEXT: s_endpgm
162161
entry:
163162
br label %A

llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -68,10 +68,9 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
6868
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
6969
; GFX906-NEXT: global_load_dword v1, v2, s[4:5]
7070
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
71-
; GFX906-NEXT: s_cbranch_execz .LBB1_2
7271
; GFX906-NEXT: ; %bb.1: ; %bb.1
7372
; GFX906-NEXT: global_load_dword v1, v2, s[6:7]
74-
; GFX906-NEXT: .LBB1_2: ; %bb.2
73+
; GFX906-NEXT: ; %bb.2: ; %bb.2
7574
; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
7675
; GFX906-NEXT: v_mov_b32_e32 v0, 0
7776
; GFX906-NEXT: s_waitcnt vmcnt(0)
@@ -149,10 +148,9 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
149148
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
150149
; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[4:5]
151150
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
152-
; GFX906-NEXT: s_cbranch_execz .LBB3_2
153151
; GFX906-NEXT: ; %bb.1: ; %bb.1
154152
; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[6:7]
155-
; GFX906-NEXT: .LBB3_2: ; %bb.2
153+
; GFX906-NEXT: ; %bb.2: ; %bb.2
156154
; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
157155
; GFX906-NEXT: v_mov_b32_e32 v0, 0
158156
; GFX906-NEXT: s_waitcnt vmcnt(0)
@@ -185,10 +183,9 @@ define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
185183
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
186184
; GFX906-NEXT: global_load_dwordx4 v[1:4], v5, s[4:5]
187185
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
188-
; GFX906-NEXT: s_cbranch_execz .LBB4_2
189186
; GFX906-NEXT: ; %bb.1: ; %bb.1
190187
; GFX906-NEXT: global_load_dwordx4 v[1:4], v5, s[6:7]
191-
; GFX906-NEXT: .LBB4_2: ; %bb.2
188+
; GFX906-NEXT: ; %bb.2: ; %bb.2
192189
; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
193190
; GFX906-NEXT: v_mov_b32_e32 v0, 0
194191
; GFX906-NEXT: s_waitcnt vmcnt(0)
@@ -222,11 +219,10 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
222219
; GFX906-NEXT: global_load_dwordx4 v[1:4], v9, s[4:5]
223220
; GFX906-NEXT: global_load_dwordx4 v[5:8], v9, s[4:5] offset:16
224221
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
225-
; GFX906-NEXT: s_cbranch_execz .LBB5_2
226222
; GFX906-NEXT: ; %bb.1: ; %bb.1
227223
; GFX906-NEXT: global_load_dwordx4 v[1:4], v9, s[6:7]
228224
; GFX906-NEXT: global_load_dwordx4 v[5:8], v9, s[6:7] offset:16
229-
; GFX906-NEXT: .LBB5_2: ; %bb.2
225+
; GFX906-NEXT: ; %bb.2: ; %bb.2
230226
; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
231227
; GFX906-NEXT: v_mov_b32_e32 v0, 0
232228
; GFX906-NEXT: s_waitcnt vmcnt(1)
@@ -486,14 +482,13 @@ define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(
486482
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
487483
; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[4:5]
488484
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
489-
; GFX906-NEXT: s_cbranch_execz .LBB8_2
490485
; GFX906-NEXT: ; %bb.1: ; %bb.1
491486
; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[6:7]
492487
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
493488
; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
494489
; GFX906-NEXT: s_and_b64 s[4:5], exec, vcc
495490
; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
496-
; GFX906-NEXT: .LBB8_2: ; %Flow
491+
; GFX906-NEXT: ; %bb.2: ; %Flow
497492
; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
498493
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
499494
; GFX906-NEXT: s_cbranch_execz .LBB8_4
@@ -547,11 +542,10 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac
547542
; GFX906-NEXT: global_load_dwordx2 v[1:2], v5, s[6:7]
548543
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
549544
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
550-
; GFX906-NEXT: s_cbranch_execz .LBB9_3
551545
; GFX906-NEXT: ; %bb.2: ; %bb.2
552546
; GFX906-NEXT: v_mov_b32_e32 v0, 0
553547
; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[8:9]
554-
; GFX906-NEXT: .LBB9_3: ; %Flow
548+
; GFX906-NEXT: ; %bb.3: ; %Flow
555549
; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
556550
; GFX906-NEXT: .LBB9_4: ; %bb.3
557551
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]

0 commit comments

Comments
 (0)