Skip to content

Commit eacad95

Browse files
committed
[AMDGPU][SIPreEmitPeephole] mustRetainExeczBranch: estimate ThenBlock cost using MachineTraceInfo
1 parent 383fc49 commit eacad95

20 files changed

+181
-259
lines changed

llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp

Lines changed: 53 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,10 @@
1515
#include "GCNSubtarget.h"
1616
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
1717
#include "llvm/CodeGen/MachineFunctionPass.h"
18+
#include "llvm/CodeGen/MachineInstr.h"
19+
#include "llvm/CodeGen/MachineTraceMetrics.h"
1820
#include "llvm/CodeGen/TargetSchedule.h"
21+
#include "llvm/InitializePasses.h"
1922
#include "llvm/Support/BranchProbability.h"
2023

2124
using namespace llvm;
@@ -28,6 +31,8 @@ class SIPreEmitPeephole : public MachineFunctionPass {
2831
private:
2932
const SIInstrInfo *TII = nullptr;
3033
const SIRegisterInfo *TRI = nullptr;
34+
MachineTraceMetrics *Traces = nullptr;
35+
MachineTraceMetrics::Ensemble *MinInstr;
3136

3237
bool optimizeVccBranch(MachineInstr &MI) const;
3338
bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
@@ -37,9 +42,14 @@ class SIPreEmitPeephole : public MachineFunctionPass {
3742
SmallVectorImpl<MachineOperand> &Cond);
3843
bool mustRetainExeczBranch(const MachineInstr &Branch,
3944
const MachineBasicBlock &From,
40-
const MachineBasicBlock &To) const;
45+
const MachineBasicBlock &To);
4146
bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
4247

48+
void getAnalysisUsage(AnalysisUsage &AU) const override {
49+
AU.addRequired<MachineTraceMetrics>();
50+
MachineFunctionPass::getAnalysisUsage(AU);
51+
}
52+
4353
public:
4454
static char ID;
4555

@@ -52,8 +62,11 @@ class SIPreEmitPeephole : public MachineFunctionPass {
5262

5363
} // End anonymous namespace.
5464

55-
INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE,
56-
"SI peephole optimizations", false, false)
65+
INITIALIZE_PASS_BEGIN(SIPreEmitPeephole, DEBUG_TYPE,
66+
"SI peephole optimizations", false, false)
67+
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
68+
INITIALIZE_PASS_END(SIPreEmitPeephole, DEBUG_TYPE, "SI peephole optimizations",
69+
false, false)
5770

5871
char SIPreEmitPeephole::ID = 0;
5972

@@ -299,61 +312,21 @@ bool SIPreEmitPeephole::getBlockDestinations(
299312
return true;
300313
}
301314

302-
namespace {
303-
class BranchWeightCostModel {
304-
const SIInstrInfo &TII;
305-
const TargetSchedModel &SchedModel;
306-
BranchProbability BranchProb;
307-
static constexpr uint64_t BranchNotTakenCost = 1;
308-
uint64_t BranchTakenCost;
309-
uint64_t ThenCyclesCost = 0;
310-
311-
public:
312-
BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch,
313-
const MachineBasicBlock &Succ)
314-
: TII(TII), SchedModel(TII.getSchedModel()) {
315-
assert(SchedModel.hasInstrSchedModelOrItineraries());
316-
317-
const MachineBasicBlock &Head = *Branch.getParent();
318-
const auto *FromIt = find(Head.successors(), &Succ);
319-
assert(FromIt != Head.succ_end());
320-
321-
BranchProb = Head.getSuccProbability(FromIt);
322-
assert(!BranchProb.isUnknown());
323-
BranchTakenCost = SchedModel.computeInstrLatency(&Branch, false);
324-
}
325-
326-
bool isProfitable(const MachineInstr &MI) {
327-
if (TII.isWaitcnt(MI.getOpcode()))
328-
return false;
329-
330-
ThenCyclesCost += SchedModel.computeInstrLatency(&MI, false);
331-
332-
// Consider `P = N/D` to be the probability of execz being false (skipping
333-
// the then-block) The transformation is profitable if always executing the
334-
// 'then' block is cheaper than executing sometimes 'then' and always
335-
// executing s_cbranch_execz:
336-
// * ThenCost <= P*ThenCost + (1-P)*BranchTakenCost + P*BranchNonTakenCost
337-
// * (1-P) * ThenCost <= (1-P)*BranchTakenCost + P*BranchNonTakenCost
338-
// * (D-N)/D * ThenCost <= (D-N)/D * BranchTakenCost + N/D *
339-
// BranchNonTakenCost
340-
uint64_t Numerator = BranchProb.getNumerator();
341-
uint64_t Denominator = BranchProb.getDenominator();
342-
return (Denominator - Numerator) * ThenCyclesCost <=
343-
((Denominator - Numerator) * BranchTakenCost +
344-
Numerator * BranchNotTakenCost);
345-
}
346-
};
347-
348-
bool SIPreEmitPeephole::mustRetainExeczBranch(
349-
const MachineInstr &Branch, const MachineBasicBlock &From,
350-
const MachineBasicBlock &To) const {
315+
bool SIPreEmitPeephole::mustRetainExeczBranch(const MachineInstr &Branch,
316+
const MachineBasicBlock &From,
317+
const MachineBasicBlock &To) {
351318

352319
const MachineBasicBlock &Head = *Branch.getParent();
353-
assert(is_contained(Head.successors(), &From));
320+
const auto *FromIt = find(Head.successors(), &From);
321+
assert(FromIt != Head.succ_end());
354322

355-
BranchWeightCostModel CostModel{*TII, Branch, From};
323+
auto BranchProb = Head.getSuccProbability(FromIt);
324+
assert(!BranchProb.isUnknown());
325+
uint64_t BranchTakenCost =
326+
TII->getSchedModel().computeInstrLatency(&Branch, false);
327+
constexpr uint64_t BranchNotTakenCost = 1;
356328

329+
unsigned ThenCyclesCost = 0;
357330
const MachineFunction *MF = From.getParent();
358331
for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
359332
MBBI != End && MBBI != ToI; ++MBBI) {
@@ -372,14 +345,36 @@ bool SIPreEmitPeephole::mustRetainExeczBranch(
372345
if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
373346
return true;
374347

375-
if (!CostModel.isProfitable(MI))
348+
if (TII->isWaitcnt(MI.getOpcode()))
376349
return true;
377350
}
351+
352+
if (!MinInstr)
353+
MinInstr = Traces->getEnsemble(MachineTraceStrategy::TS_Local);
354+
355+
MachineTraceMetrics::Trace Trace = MinInstr->getTrace(&From);
356+
ThenCyclesCost +=
357+
std::max(Trace.getCriticalPath(), Trace.getResourceDepth(true));
358+
359+
// Consider `P = N/D` to be the probability of execz being false (i.e. the
360+
// then-block is executed). The transformation is profitable if always
361+
// executing the 'then' block is cheaper than sometimes executing 'then' and
362+
// always executing s_cbranch_execz:
363+
// * ThenCost <= P*ThenCost + (1-P)*BranchTakenCost + P*BranchNotTakenCost
364+
// * (1-P) * ThenCost <= (1-P)*BranchTakenCost + P*BranchNotTakenCost
365+
// * (D-N)/D * ThenCost <= (D-N)/D * BranchTakenCost + N/D *
366+
// BranchNotTakenCost
367+
uint64_t Numerator = BranchProb.getNumerator();
368+
uint64_t Denominator = BranchProb.getDenominator();
369+
bool IsProfitable = (Denominator - Numerator) * ThenCyclesCost <=
370+
((Denominator - Numerator) * BranchTakenCost +
371+
Numerator * BranchNotTakenCost);
372+
if (!IsProfitable)
373+
return true;
378374
}
379375

380376
return false;
381377
}
382-
} // namespace
383378

384379
// Returns true if the skip branch instruction is removed.
385380
bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
@@ -414,6 +409,8 @@ bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
414409
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
415410
TII = ST.getInstrInfo();
416411
TRI = &TII->getRegisterInfo();
412+
Traces = &getAnalysis<MachineTraceMetrics>();
413+
MinInstr = nullptr;
417414
bool Changed = false;
418415

419416
MF.RenumberBlocks();

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -249,11 +249,10 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts,
249249
; GFX10-NEXT: .LBB3_6: ; %Flow1
250250
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
251251
; GFX10-NEXT: s_and_saveexec_b32 s4, s6
252-
; GFX10-NEXT: s_cbranch_execz .LBB3_8
253252
; GFX10-NEXT: ; %bb.7: ; %block.after.loop
254253
; GFX10-NEXT: v_mov_b32_e32 v0, 5
255254
; GFX10-NEXT: flat_store_dword v[3:4], v0
256-
; GFX10-NEXT: .LBB3_8: ; %exit
255+
; GFX10-NEXT: ; %bb.8: ; %exit
257256
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
258257
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
259258
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -315,15 +314,14 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
315314
; GFX10-NEXT: v_mov_b32_e32 v4, v5
316315
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
317316
; GFX10-NEXT: s_and_saveexec_b32 s7, vcc_lo
318-
; GFX10-NEXT: s_cbranch_execz .LBB4_4
319317
; GFX10-NEXT: ; %bb.3: ; %if.block.0
320318
; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1
321319
; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v4
322320
; GFX10-NEXT: v_lshlrev_b64 v[8:9], 2, v[4:5]
323321
; GFX10-NEXT: v_add_co_u32 v8, s4, v2, v8
324322
; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v3, v9, s4
325323
; GFX10-NEXT: global_store_dword v[8:9], v4, off
326-
; GFX10-NEXT: .LBB4_4: ; %loop.break.block
324+
; GFX10-NEXT: ; %bb.4: ; %loop.break.block
327325
; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1
328326
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
329327
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7
@@ -342,10 +340,9 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
342340
; GFX10-NEXT: .LBB4_6: ; %cond.block.1
343341
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
344342
; GFX10-NEXT: s_and_saveexec_b32 s4, s6
345-
; GFX10-NEXT: s_cbranch_execz .LBB4_8
346343
; GFX10-NEXT: ; %bb.7: ; %if.block.1
347344
; GFX10-NEXT: global_store_dword v[6:7], v4, off
348-
; GFX10-NEXT: .LBB4_8: ; %exit
345+
; GFX10-NEXT: ; %bb.8: ; %exit
349346
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
350347
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
351348
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -536,11 +533,10 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a
536533
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0
537534
; GFX10-NEXT: s_and_saveexec_b32 s0, s1
538535
; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0
539-
; GFX10-NEXT: s_cbranch_execz .LBB6_6
540536
; GFX10-NEXT: ; %bb.5: ; %break.body
541537
; GFX10-NEXT: v_mov_b32_e32 v0, 10
542538
; GFX10-NEXT: global_store_dword v[4:5], v0, off
543-
; GFX10-NEXT: .LBB6_6: ; %exit
539+
; GFX10-NEXT: ; %bb.6: ; %exit
544540
; GFX10-NEXT: s_endpgm
545541
entry:
546542
br label %A

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -437,11 +437,10 @@ define amdgpu_cs void @loop_with_div_break_with_body(ptr addrspace(1) %x, ptr ad
437437
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0
438438
; GFX10-NEXT: s_and_saveexec_b32 s0, s1
439439
; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0
440-
; GFX10-NEXT: s_cbranch_execz .LBB5_6
441440
; GFX10-NEXT: ; %bb.5: ; %break.body
442441
; GFX10-NEXT: v_mov_b32_e32 v0, 10
443442
; GFX10-NEXT: global_store_dword v[4:5], v0, off
444-
; GFX10-NEXT: .LBB5_6: ; %exit
443+
; GFX10-NEXT: ; %bb.6: ; %exit
445444
; GFX10-NEXT: s_endpgm
446445
entry:
447446
br label %A

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -152,12 +152,11 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, i32 %x.size, ptr ad
152152
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
153153
; GFX10-NEXT: s_and_saveexec_b32 s1, s0
154154
; GFX10-NEXT: s_xor_b32 s1, exec_lo, s1
155-
; GFX10-NEXT: s_cbranch_execz .LBB2_7
156155
; GFX10-NEXT: ; %bb.6: ; %break.body
157156
; GFX10-NEXT: v_mov_b32_e32 v0, 10
158157
; GFX10-NEXT: v_mov_b32_e32 v1, 0
159158
; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
160-
; GFX10-NEXT: .LBB2_7: ; %exit
159+
; GFX10-NEXT: ; %bb.7: ; %exit
161160
; GFX10-NEXT: s_endpgm
162161
entry:
163162
br label %A

llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -68,10 +68,9 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
6868
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
6969
; GFX906-NEXT: global_load_dword v1, v2, s[4:5]
7070
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
71-
; GFX906-NEXT: s_cbranch_execz .LBB1_2
7271
; GFX906-NEXT: ; %bb.1: ; %bb.1
7372
; GFX906-NEXT: global_load_dword v1, v2, s[6:7]
74-
; GFX906-NEXT: .LBB1_2: ; %bb.2
73+
; GFX906-NEXT: ; %bb.2: ; %bb.2
7574
; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
7675
; GFX906-NEXT: v_mov_b32_e32 v0, 0
7776
; GFX906-NEXT: s_waitcnt vmcnt(0)
@@ -149,10 +148,9 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
149148
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
150149
; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[4:5]
151150
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
152-
; GFX906-NEXT: s_cbranch_execz .LBB3_2
153151
; GFX906-NEXT: ; %bb.1: ; %bb.1
154152
; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[6:7]
155-
; GFX906-NEXT: .LBB3_2: ; %bb.2
153+
; GFX906-NEXT: ; %bb.2: ; %bb.2
156154
; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
157155
; GFX906-NEXT: v_mov_b32_e32 v0, 0
158156
; GFX906-NEXT: s_waitcnt vmcnt(0)
@@ -185,10 +183,9 @@ define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
185183
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
186184
; GFX906-NEXT: global_load_dwordx4 v[1:4], v5, s[4:5]
187185
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
188-
; GFX906-NEXT: s_cbranch_execz .LBB4_2
189186
; GFX906-NEXT: ; %bb.1: ; %bb.1
190187
; GFX906-NEXT: global_load_dwordx4 v[1:4], v5, s[6:7]
191-
; GFX906-NEXT: .LBB4_2: ; %bb.2
188+
; GFX906-NEXT: ; %bb.2: ; %bb.2
192189
; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
193190
; GFX906-NEXT: v_mov_b32_e32 v0, 0
194191
; GFX906-NEXT: s_waitcnt vmcnt(0)
@@ -222,11 +219,10 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
222219
; GFX906-NEXT: global_load_dwordx4 v[1:4], v9, s[4:5]
223220
; GFX906-NEXT: global_load_dwordx4 v[5:8], v9, s[4:5] offset:16
224221
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
225-
; GFX906-NEXT: s_cbranch_execz .LBB5_2
226222
; GFX906-NEXT: ; %bb.1: ; %bb.1
227223
; GFX906-NEXT: global_load_dwordx4 v[1:4], v9, s[6:7]
228224
; GFX906-NEXT: global_load_dwordx4 v[5:8], v9, s[6:7] offset:16
229-
; GFX906-NEXT: .LBB5_2: ; %bb.2
225+
; GFX906-NEXT: ; %bb.2: ; %bb.2
230226
; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
231227
; GFX906-NEXT: v_mov_b32_e32 v0, 0
232228
; GFX906-NEXT: s_waitcnt vmcnt(1)
@@ -486,14 +482,13 @@ define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(
486482
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
487483
; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[4:5]
488484
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
489-
; GFX906-NEXT: s_cbranch_execz .LBB8_2
490485
; GFX906-NEXT: ; %bb.1: ; %bb.1
491486
; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[6:7]
492487
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
493488
; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
494489
; GFX906-NEXT: s_and_b64 s[4:5], exec, vcc
495490
; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
496-
; GFX906-NEXT: .LBB8_2: ; %Flow
491+
; GFX906-NEXT: ; %bb.2: ; %Flow
497492
; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
498493
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
499494
; GFX906-NEXT: s_cbranch_execz .LBB8_4
@@ -547,11 +542,10 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac
547542
; GFX906-NEXT: global_load_dwordx2 v[1:2], v5, s[6:7]
548543
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
549544
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
550-
; GFX906-NEXT: s_cbranch_execz .LBB9_3
551545
; GFX906-NEXT: ; %bb.2: ; %bb.2
552546
; GFX906-NEXT: v_mov_b32_e32 v0, 0
553547
; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[8:9]
554-
; GFX906-NEXT: .LBB9_3: ; %Flow
548+
; GFX906-NEXT: ; %bb.3: ; %Flow
555549
; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
556550
; GFX906-NEXT: .LBB9_4: ; %bb.3
557551
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]

0 commit comments

Comments
 (0)