Skip to content

Commit 54796a9

Browse files
committed
[AMDGPU][SIPreEmitPeephole] mustRetainExeczBranch: estimate ThenBlock cost using MachineTraceInfo
1 parent 1aa48af commit 54796a9

31 files changed

+226
-176
lines changed

llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp

Lines changed: 57 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,9 @@
1515
#include "GCNSubtarget.h"
1616
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
1717
#include "llvm/CodeGen/MachineFunctionPass.h"
18+
#include "llvm/CodeGen/MachineTraceMetrics.h"
1819
#include "llvm/CodeGen/TargetSchedule.h"
20+
#include "llvm/InitializePasses.h"
1921
#include "llvm/Support/BranchProbability.h"
2022

2123
using namespace llvm;
@@ -29,6 +31,10 @@ class SIPreEmitPeephole : public MachineFunctionPass {
2931
const SIInstrInfo *TII = nullptr;
3032
const SIRegisterInfo *TRI = nullptr;
3133

34+
// Trace metrics analysis result, used to estimate the number of cycles it
35+
// takes to execute a block.
36+
MachineTraceMetrics::Ensemble *Traces;
37+
3238
bool optimizeVccBranch(MachineInstr &MI) const;
3339
bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
3440
bool getBlockDestinations(MachineBasicBlock &SrcMBB,
@@ -37,9 +43,14 @@ class SIPreEmitPeephole : public MachineFunctionPass {
3743
SmallVectorImpl<MachineOperand> &Cond);
3844
bool mustRetainExeczBranch(const MachineInstr &Branch,
3945
const MachineBasicBlock &From,
40-
const MachineBasicBlock &To) const;
46+
const MachineBasicBlock &To);
4147
bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
4248

49+
void getAnalysisUsage(AnalysisUsage &AU) const override {
50+
AU.addRequired<MachineTraceMetricsWrapperPass>();
51+
MachineFunctionPass::getAnalysisUsage(AU);
52+
}
53+
4354
public:
4455
static char ID;
4556

@@ -52,8 +63,11 @@ class SIPreEmitPeephole : public MachineFunctionPass {
5263

5364
} // End anonymous namespace.
5465

55-
INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE,
56-
"SI peephole optimizations", false, false)
66+
INITIALIZE_PASS_BEGIN(SIPreEmitPeephole, DEBUG_TYPE,
67+
"SI peephole optimizations", false, false)
68+
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetricsWrapperPass)
69+
INITIALIZE_PASS_END(SIPreEmitPeephole, DEBUG_TYPE, "SI peephole optimizations",
70+
false, false)
5771

5872
char SIPreEmitPeephole::ID = 0;
5973

@@ -299,58 +313,23 @@ bool SIPreEmitPeephole::getBlockDestinations(
299313
return true;
300314
}
301315

302-
namespace {
303-
class BranchWeightCostModel {
304-
const SIInstrInfo &TII;
305-
const TargetSchedModel &SchedModel;
306-
BranchProbability BranchProb;
307-
static constexpr uint64_t BranchNotTakenCost = 1;
308-
uint64_t BranchTakenCost;
309-
uint64_t ThenCyclesCost = 0;
316+
bool SIPreEmitPeephole::mustRetainExeczBranch(const MachineInstr &Branch,
317+
const MachineBasicBlock &From,
318+
const MachineBasicBlock &To) {
319+
assert(is_contained(Branch.getParent()->successors(), &From));
310320

311-
public:
312-
BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch,
313-
const MachineBasicBlock &Succ)
314-
: TII(TII), SchedModel(TII.getSchedModel()) {
315-
const MachineBasicBlock &Head = *Branch.getParent();
316-
const auto *FromIt = find(Head.successors(), &Succ);
317-
assert(FromIt != Head.succ_end());
318-
319-
BranchProb = Head.getSuccProbability(FromIt);
320-
if (BranchProb.isUnknown())
321-
BranchProb = BranchProbability::getZero();
322-
BranchTakenCost = SchedModel.computeInstrLatency(&Branch);
323-
}
321+
const MachineBasicBlock &Head = *Branch.getParent();
322+
const auto *FromIt = find(Head.successors(), &From);
323+
assert(FromIt != Head.succ_end());
324324

325-
bool isProfitable(const MachineInstr &MI) {
326-
if (TII.isWaitcnt(MI.getOpcode()))
327-
return false;
325+
auto BranchProb = Head.getSuccProbability(FromIt);
326+
if (BranchProb.isUnknown())
327+
return true;
328328

329-
ThenCyclesCost += SchedModel.computeInstrLatency(&MI);
330-
331-
// Consider `P = N/D` to be the probability of execz being false (skipping
332-
// the then-block) The transformation is profitable if always executing the
333-
// 'then' block is cheaper than executing sometimes 'then' and always
334-
// executing s_cbranch_execz:
335-
// * ThenCost <= P*ThenCost + (1-P)*BranchTakenCost + P*BranchNotTakenCost
336-
// * (1-P) * ThenCost <= (1-P)*BranchTakenCost + P*BranchNotTakenCost
337-
// * (D-N)/D * ThenCost <= (D-N)/D * BranchTakenCost + N/D *
338-
// BranchNotTakenCost
339-
uint64_t Numerator = BranchProb.getNumerator();
340-
uint64_t Denominator = BranchProb.getDenominator();
341-
return (Denominator - Numerator) * ThenCyclesCost <=
342-
((Denominator - Numerator) * BranchTakenCost +
343-
Numerator * BranchNotTakenCost);
344-
}
345-
};
329+
const MachineFunction *MF = From.getParent();
346330

347-
bool SIPreEmitPeephole::mustRetainExeczBranch(
348-
const MachineInstr &Branch, const MachineBasicBlock &From,
349-
const MachineBasicBlock &To) const {
350-
assert(is_contained(Branch.getParent()->successors(), &From));
351-
BranchWeightCostModel CostModel{*TII, Branch, From};
331+
SmallVector<const MachineBasicBlock *> ThenBlocks;
352332

353-
const MachineFunction *MF = From.getParent();
354333
for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
355334
MBBI != End && MBBI != ToI; ++MBBI) {
356335
const MachineBasicBlock &MBB = *MBBI;
@@ -372,14 +351,37 @@ bool SIPreEmitPeephole::mustRetainExeczBranch(
372351
if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
373352
return true;
374353

375-
if (!CostModel.isProfitable(MI))
354+
if (TII->isWaitcnt(MI.getOpcode()))
376355
return true;
377356
}
357+
ThenBlocks.push_back(&MBB);
378358
}
379359

380-
return false;
360+
MachineTraceMetrics::Trace Trace = Traces->getTrace(&Head);
361+
const MCSchedClassDesc *BranchSchedClassDesc =
362+
TII->getSchedModel().getMCSchedModel()->getSchedClassDesc(
363+
Branch.getDesc().getSchedClass());
364+
unsigned ResourceThenWithoutBranch =
365+
Trace.getResourceLength(ThenBlocks, {}, {BranchSchedClassDesc});
366+
unsigned ResourceThenWithBranch = Trace.getResourceLength(ThenBlocks, {}, {});
367+
unsigned ResourceElseWithBranch = Trace.getResourceLength({}, {}, {});
368+
369+
// Consider `P = N/D` to be the probability of execz being false (entering
370+
// the then-block). The transformation is profitable if always executing the
371+
// 'then' block is cheaper than executing sometimes 'then' and always
372+
// executing s_cbranch_execz:
373+
// * ThenCost <= P*BranchThenCost + (1-P)*BranchElseCost
374+
// * D * ThenCost <= N * BranchThenCost + (D - N) * BranchElseCost
375+
// For the resource length to be equivalent to the number of cycles to execute
376+
// the block, we assume no data-dependencies between the instructions. This
377+
// may not be true and should be refined.
378+
uint64_t Numerator = BranchProb.getNumerator();
379+
uint64_t Denominator = BranchProb.getDenominator();
380+
bool IsProfitable = Denominator * ResourceThenWithoutBranch <=
381+
Numerator * ResourceThenWithBranch +
382+
(Denominator - Numerator) * ResourceElseWithBranch;
383+
return !IsProfitable;
381384
}
382-
} // namespace
383385

384386
// Returns true if the skip branch instruction is removed.
385387
bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
@@ -414,6 +416,8 @@ bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
414416
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
415417
TII = ST.getInstrInfo();
416418
TRI = &TII->getRegisterInfo();
419+
Traces = getAnalysis<MachineTraceMetricsWrapperPass>().getMTM().getEnsemble(
420+
llvm::MachineTraceStrategy::TS_MinInstrCount);
417421
bool Changed = false;
418422

419423
MF.RenumberBlocks();

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -249,11 +249,10 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts,
249249
; GFX10-NEXT: .LBB3_6: ; %Flow1
250250
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
251251
; GFX10-NEXT: s_and_saveexec_b32 s4, s6
252-
; GFX10-NEXT: s_cbranch_execz .LBB3_8
253252
; GFX10-NEXT: ; %bb.7: ; %block.after.loop
254253
; GFX10-NEXT: v_mov_b32_e32 v0, 5
255254
; GFX10-NEXT: flat_store_dword v[3:4], v0
256-
; GFX10-NEXT: .LBB3_8: ; %exit
255+
; GFX10-NEXT: ; %bb.8: ; %exit
257256
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
258257
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
259258
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -340,10 +339,9 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
340339
; GFX10-NEXT: .LBB4_6: ; %cond.block.1
341340
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
342341
; GFX10-NEXT: s_and_saveexec_b32 s4, s6
343-
; GFX10-NEXT: s_cbranch_execz .LBB4_8
344342
; GFX10-NEXT: ; %bb.7: ; %if.block.1
345343
; GFX10-NEXT: global_store_dword v[6:7], v4, off
346-
; GFX10-NEXT: .LBB4_8: ; %exit
344+
; GFX10-NEXT: ; %bb.8: ; %exit
347345
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
348346
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
349347
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -534,11 +532,10 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a
534532
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0
535533
; GFX10-NEXT: s_and_saveexec_b32 s0, s1
536534
; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0
537-
; GFX10-NEXT: s_cbranch_execz .LBB6_6
538535
; GFX10-NEXT: ; %bb.5: ; %break.body
539536
; GFX10-NEXT: v_mov_b32_e32 v0, 10
540537
; GFX10-NEXT: global_store_dword v[4:5], v0, off
541-
; GFX10-NEXT: .LBB6_6: ; %exit
538+
; GFX10-NEXT: ; %bb.6: ; %exit
542539
; GFX10-NEXT: s_endpgm
543540
entry:
544541
br label %A

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,13 @@ define amdgpu_ps void @divergent_i1_phi_if_then(ptr addrspace(1) %out, i32 %tid,
1010
; GFX10-NEXT: v_cmp_le_u32_e64 s0, 6, v2
1111
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
1212
; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
13+
; GFX10-NEXT: s_cbranch_execz .LBB0_2
1314
; GFX10-NEXT: ; %bb.1: ; %B
1415
; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 1, v2
1516
; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo
1617
; GFX10-NEXT: s_and_b32 s2, exec_lo, vcc_lo
1718
; GFX10-NEXT: s_or_b32 s0, s0, s2
18-
; GFX10-NEXT: ; %bb.2: ; %exit
19+
; GFX10-NEXT: .LBB0_2: ; %exit
1920
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
2021
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, s0
2122
; GFX10-NEXT: v_add_nc_u32_e32 v2, 2, v2
@@ -46,20 +47,22 @@ define amdgpu_ps void @divergent_i1_phi_if_else(ptr addrspace(1) %out, i32 %tid,
4647
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
4748
; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
4849
; GFX10-NEXT: s_xor_b32 s1, exec_lo, s1
50+
; GFX10-NEXT: s_cbranch_execz .LBB1_2
4951
; GFX10-NEXT: ; %bb.1: ; %B
5052
; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 2, v2
5153
; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo
5254
; GFX10-NEXT: ; implicit-def: $vgpr2
5355
; GFX10-NEXT: s_and_b32 s2, exec_lo, vcc_lo
5456
; GFX10-NEXT: s_or_b32 s0, s0, s2
55-
; GFX10-NEXT: ; %bb.2: ; %Flow
57+
; GFX10-NEXT: .LBB1_2: ; %Flow
5658
; GFX10-NEXT: s_andn2_saveexec_b32 s1, s1
59+
; GFX10-NEXT: s_cbranch_execz .LBB1_4
5760
; GFX10-NEXT: ; %bb.3: ; %A
5861
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 1, v2
5962
; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo
6063
; GFX10-NEXT: s_and_b32 s2, exec_lo, vcc_lo
6164
; GFX10-NEXT: s_or_b32 s0, s0, s2
62-
; GFX10-NEXT: ; %bb.4: ; %exit
65+
; GFX10-NEXT: .LBB1_4: ; %exit
6366
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
6467
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, s0
6568
; GFX10-NEXT: v_add_nc_u32_e32 v2, 2, v2
@@ -437,11 +440,10 @@ define amdgpu_cs void @loop_with_div_break_with_body(ptr addrspace(1) %x, ptr ad
437440
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0
438441
; GFX10-NEXT: s_and_saveexec_b32 s0, s1
439442
; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0
440-
; GFX10-NEXT: s_cbranch_execz .LBB5_6
441443
; GFX10-NEXT: ; %bb.5: ; %break.body
442444
; GFX10-NEXT: v_mov_b32_e32 v0, 10
443445
; GFX10-NEXT: global_store_dword v[4:5], v0, off
444-
; GFX10-NEXT: .LBB5_6: ; %exit
446+
; GFX10-NEXT: ; %bb.6: ; %exit
445447
; GFX10-NEXT: s_endpgm
446448
entry:
447449
br label %A

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -433,20 +433,22 @@ define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid
433433
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
434434
; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
435435
; GFX10-NEXT: s_xor_b32 s1, exec_lo, s1
436+
; GFX10-NEXT: s_cbranch_execz .LBB20_2
436437
; GFX10-NEXT: ; %bb.1: ; %B
437438
; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 2, v2
438439
; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo
439440
; GFX10-NEXT: ; implicit-def: $vgpr2
440441
; GFX10-NEXT: s_and_b32 s2, exec_lo, vcc_lo
441442
; GFX10-NEXT: s_or_b32 s0, s0, s2
442-
; GFX10-NEXT: ; %bb.2: ; %Flow
443+
; GFX10-NEXT: .LBB20_2: ; %Flow
443444
; GFX10-NEXT: s_andn2_saveexec_b32 s1, s1
445+
; GFX10-NEXT: s_cbranch_execz .LBB20_4
444446
; GFX10-NEXT: ; %bb.3: ; %A
445447
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 1, v2
446448
; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo
447449
; GFX10-NEXT: s_and_b32 s2, exec_lo, vcc_lo
448450
; GFX10-NEXT: s_or_b32 s0, s0, s2
449-
; GFX10-NEXT: ; %bb.4: ; %exit
451+
; GFX10-NEXT: .LBB20_4: ; %exit
450452
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
451453
; GFX10-NEXT: s_and_b32 s0, s0, exec_lo
452454
; GFX10-NEXT: v_mov_b32_e32 v2, s0
@@ -460,20 +462,22 @@ define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid
460462
; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
461463
; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v3
462464
; GFX11-NEXT: s_xor_b32 s1, exec_lo, s1
465+
; GFX11-NEXT: s_cbranch_execz .LBB20_2
463466
; GFX11-NEXT: ; %bb.1: ; %B
464467
; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 2, v2
465468
; GFX11-NEXT: s_and_not1_b32 s0, s0, exec_lo
466469
; GFX11-NEXT: ; implicit-def: $vgpr2
467470
; GFX11-NEXT: s_and_b32 s2, exec_lo, vcc_lo
468471
; GFX11-NEXT: s_or_b32 s0, s0, s2
469-
; GFX11-NEXT: ; %bb.2: ; %Flow
472+
; GFX11-NEXT: .LBB20_2: ; %Flow
470473
; GFX11-NEXT: s_and_not1_saveexec_b32 s1, s1
474+
; GFX11-NEXT: s_cbranch_execz .LBB20_4
471475
; GFX11-NEXT: ; %bb.3: ; %A
472476
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 1, v2
473477
; GFX11-NEXT: s_and_not1_b32 s0, s0, exec_lo
474478
; GFX11-NEXT: s_and_b32 s2, exec_lo, vcc_lo
475479
; GFX11-NEXT: s_or_b32 s0, s0, s2
476-
; GFX11-NEXT: ; %bb.4: ; %exit
480+
; GFX11-NEXT: .LBB20_4: ; %exit
477481
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
478482
; GFX11-NEXT: s_and_b32 s0, s0, exec_lo
479483
; GFX11-NEXT: v_mov_b32_e32 v2, s0

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -436,20 +436,22 @@ define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid
436436
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
437437
; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc
438438
; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
439+
; CHECK-NEXT: s_cbranch_execz .LBB20_2
439440
; CHECK-NEXT: ; %bb.1: ; %B
440441
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 2, v2
441442
; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
442443
; CHECK-NEXT: s_and_b64 s[4:5], exec, vcc
443444
; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
444445
; CHECK-NEXT: ; implicit-def: $vgpr2
445-
; CHECK-NEXT: ; %bb.2: ; %Flow
446+
; CHECK-NEXT: .LBB20_2: ; %Flow
446447
; CHECK-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3]
448+
; CHECK-NEXT: s_cbranch_execz .LBB20_4
447449
; CHECK-NEXT: ; %bb.3: ; %A
448450
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, 1, v2
449451
; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
450452
; CHECK-NEXT: s_and_b64 s[4:5], exec, vcc
451453
; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
452-
; CHECK-NEXT: ; %bb.4: ; %exit
454+
; CHECK-NEXT: .LBB20_4: ; %exit
453455
; CHECK-NEXT: s_or_b64 exec, exec, s[2:3]
454456
; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
455457
; CHECK-NEXT: v_mov_b32_e32 v3, s1

llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -68,10 +68,9 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
6868
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
6969
; GFX906-NEXT: global_load_dword v1, v2, s[0:1]
7070
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
71-
; GFX906-NEXT: s_cbranch_execz .LBB1_2
7271
; GFX906-NEXT: ; %bb.1: ; %bb.1
7372
; GFX906-NEXT: global_load_dword v1, v2, s[2:3]
74-
; GFX906-NEXT: .LBB1_2: ; %bb.2
73+
; GFX906-NEXT: ; %bb.2: ; %bb.2
7574
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
7675
; GFX906-NEXT: v_mov_b32_e32 v0, 0
7776
; GFX906-NEXT: s_waitcnt vmcnt(0)
@@ -149,10 +148,9 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
149148
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
150149
; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1]
151150
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
152-
; GFX906-NEXT: s_cbranch_execz .LBB3_2
153151
; GFX906-NEXT: ; %bb.1: ; %bb.1
154152
; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[2:3]
155-
; GFX906-NEXT: .LBB3_2: ; %bb.2
153+
; GFX906-NEXT: ; %bb.2: ; %bb.2
156154
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
157155
; GFX906-NEXT: v_mov_b32_e32 v0, 0
158156
; GFX906-NEXT: s_waitcnt vmcnt(0)
@@ -185,10 +183,9 @@ define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
185183
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
186184
; GFX906-NEXT: global_load_dwordx4 v[1:4], v5, s[0:1]
187185
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
188-
; GFX906-NEXT: s_cbranch_execz .LBB4_2
189186
; GFX906-NEXT: ; %bb.1: ; %bb.1
190187
; GFX906-NEXT: global_load_dwordx4 v[1:4], v5, s[2:3]
191-
; GFX906-NEXT: .LBB4_2: ; %bb.2
188+
; GFX906-NEXT: ; %bb.2: ; %bb.2
192189
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
193190
; GFX906-NEXT: v_mov_b32_e32 v0, 0
194191
; GFX906-NEXT: s_waitcnt vmcnt(0)
@@ -222,11 +219,10 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
222219
; GFX906-NEXT: global_load_dwordx4 v[1:4], v9, s[0:1]
223220
; GFX906-NEXT: global_load_dwordx4 v[5:8], v9, s[0:1] offset:16
224221
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
225-
; GFX906-NEXT: s_cbranch_execz .LBB5_2
226222
; GFX906-NEXT: ; %bb.1: ; %bb.1
227223
; GFX906-NEXT: global_load_dwordx4 v[1:4], v9, s[2:3]
228224
; GFX906-NEXT: global_load_dwordx4 v[5:8], v9, s[2:3] offset:16
229-
; GFX906-NEXT: .LBB5_2: ; %bb.2
225+
; GFX906-NEXT: ; %bb.2: ; %bb.2
230226
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
231227
; GFX906-NEXT: v_mov_b32_e32 v0, 0
232228
; GFX906-NEXT: s_waitcnt vmcnt(1)
@@ -547,11 +543,10 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac
547543
; GFX906-NEXT: global_load_dwordx2 v[1:2], v5, s[10:11]
548544
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
549545
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
550-
; GFX906-NEXT: s_cbranch_execz .LBB9_3
551546
; GFX906-NEXT: ; %bb.2: ; %bb.2
552547
; GFX906-NEXT: v_mov_b32_e32 v0, 0
553548
; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[12:13]
554-
; GFX906-NEXT: .LBB9_3: ; %Flow
549+
; GFX906-NEXT: ; %bb.3: ; %Flow
555550
; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
556551
; GFX906-NEXT: .LBB9_4: ; %bb.3
557552
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]

0 commit comments

Comments
 (0)