llvm · jmmartinez · Feb 11, 2025
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -15,7 +15,9 @@
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineTraceMetrics.h"
 #include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/Support/BranchProbability.h"
 
 using namespace llvm;
@@ -29,6 +31,10 @@ class SIPreEmitPeephole : public MachineFunctionPass {
   const SIInstrInfo *TII = nullptr;
   const SIRegisterInfo *TRI = nullptr;
 
+  // Trace metrics analysis result, used to estimate the number of cycles it
+  // takes to execute a block.
+  MachineTraceMetrics::Ensemble *Traces;
+
   bool optimizeVccBranch(MachineInstr &MI) const;
   bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
   bool getBlockDestinations(MachineBasicBlock &SrcMBB,
@@ -37,9 +43,14 @@ class SIPreEmitPeephole : public MachineFunctionPass {
                             SmallVectorImpl<MachineOperand> &Cond);
   bool mustRetainExeczBranch(const MachineInstr &Branch,
                              const MachineBasicBlock &From,
-                             const MachineBasicBlock &To) const;
+                             const MachineBasicBlock &To);
   bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
 
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineTraceMetricsWrapperPass>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
 public:
   static char ID;
 
@@ -52,8 +63,11 @@ class SIPreEmitPeephole : public MachineFunctionPass {
 
 } // End anonymous namespace.
 
-INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE,
-                "SI peephole optimizations", false, false)
+INITIALIZE_PASS_BEGIN(SIPreEmitPeephole, DEBUG_TYPE,
+                      "SI peephole optimizations", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineTraceMetricsWrapperPass)
+INITIALIZE_PASS_END(SIPreEmitPeephole, DEBUG_TYPE, "SI peephole optimizations",
+                    false, false)
 
 char SIPreEmitPeephole::ID = 0;
 
@@ -299,58 +313,23 @@ bool SIPreEmitPeephole::getBlockDestinations(
   return true;
 }
 
-namespace {
-class BranchWeightCostModel {
-  const SIInstrInfo &TII;
-  const TargetSchedModel &SchedModel;
-  BranchProbability BranchProb;
-  static constexpr uint64_t BranchNotTakenCost = 1;
-  uint64_t BranchTakenCost;
-  uint64_t ThenCyclesCost = 0;
+bool SIPreEmitPeephole::mustRetainExeczBranch(const MachineInstr &Branch,
+                                              const MachineBasicBlock &From,
+                                              const MachineBasicBlock &To) {
+  assert(is_contained(Branch.getParent()->successors(), &From));
 
-public:
-  BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch,
-                        const MachineBasicBlock &Succ)
-      : TII(TII), SchedModel(TII.getSchedModel()) {
-    const MachineBasicBlock &Head = *Branch.getParent();
-    const auto *FromIt = find(Head.successors(), &Succ);
-    assert(FromIt != Head.succ_end());
-
-    BranchProb = Head.getSuccProbability(FromIt);
-    if (BranchProb.isUnknown())
-      BranchProb = BranchProbability::getZero();
-    BranchTakenCost = SchedModel.computeInstrLatency(&Branch);
-  }
+  const MachineBasicBlock &Head = *Branch.getParent();
+  const auto *FromIt = find(Head.successors(), &From);
+  assert(FromIt != Head.succ_end());
 
-  bool isProfitable(const MachineInstr &MI) {
-    if (TII.isWaitcnt(MI.getOpcode()))
-      return false;
+  auto BranchProb = Head.getSuccProbability(FromIt);
+  if (BranchProb.isUnknown())
+    return true;
 
-    ThenCyclesCost += SchedModel.computeInstrLatency(&MI);
-
-    // Consider `P = N/D` to be the probability of execz being false (skipping
-    // the then-block) The transformation is profitable if always executing the
-    // 'then' block is cheaper than executing sometimes 'then' and always
-    // executing s_cbranch_execz:
-    // * ThenCost <= P*ThenCost + (1-P)*BranchTakenCost + P*BranchNotTakenCost
-    // * (1-P) * ThenCost <= (1-P)*BranchTakenCost + P*BranchNotTakenCost
-    // * (D-N)/D * ThenCost <= (D-N)/D * BranchTakenCost + N/D *
-    // BranchNotTakenCost
-    uint64_t Numerator = BranchProb.getNumerator();
-    uint64_t Denominator = BranchProb.getDenominator();
-    return (Denominator - Numerator) * ThenCyclesCost <=
-           ((Denominator - Numerator) * BranchTakenCost +
-            Numerator * BranchNotTakenCost);
-  }
-};
+  const MachineFunction *MF = From.getParent();
 
-bool SIPreEmitPeephole::mustRetainExeczBranch(
-    const MachineInstr &Branch, const MachineBasicBlock &From,
-    const MachineBasicBlock &To) const {
-  assert(is_contained(Branch.getParent()->successors(), &From));
-  BranchWeightCostModel CostModel{*TII, Branch, From};
+  SmallVector<const MachineBasicBlock *> ThenBlocks;
 
-  const MachineFunction *MF = From.getParent();
   for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
        MBBI != End && MBBI != ToI; ++MBBI) {
     const MachineBasicBlock &MBB = *MBBI;
@@ -372,14 +351,37 @@ bool SIPreEmitPeephole::mustRetainExeczBranch(
       if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
         return true;
 
-      if (!CostModel.isProfitable(MI))
+      if (TII->isWaitcnt(MI.getOpcode()))
         return true;
     }
+    ThenBlocks.push_back(&MBB);
   }
 
-  return false;
+  MachineTraceMetrics::Trace Trace = Traces->getTrace(&Head);
+  const MCSchedClassDesc *BranchSchedClassDesc =
+      TII->getSchedModel().getMCSchedModel()->getSchedClassDesc(
+          Branch.getDesc().getSchedClass());
+  unsigned ResourceThenWithoutBranch =
+      Trace.getResourceLength(ThenBlocks, {}, {BranchSchedClassDesc});
+  unsigned ResourceThenWithBranch = Trace.getResourceLength(ThenBlocks, {}, {});
+  unsigned ResourceElseWithBranch = Trace.getResourceLength({}, {}, {});
+
+  // Consider `P = N/D` to be the probability of execz being false (skipping
+  // the then-block) The transformation is profitable if always executing the
+  // 'then' block is cheaper than executing sometimes 'then' and always
+  // executing s_cbranch_execz:
+  // * ThenCost <= P*BranchThenCost + (1-P)*BranchElseCost
+  // * D * ThenCost <= N * BranchThenCost + (D - N) * BranchElseCost
+  // For the resource lenght to be equivalent to the number of cycles to execute
+  // the block, we assume no data-dependencies between the instructions. This
+  // may not be true and should be refined.
+  uint64_t Numerator = BranchProb.getNumerator();
+  uint64_t Denominator = BranchProb.getDenominator();
+  bool IsProfitable = Denominator * ResourceThenWithoutBranch <=
+                      Numerator * ResourceThenWithBranch +
+                          (Denominator - Numerator) * ResourceElseWithBranch;
+  return !IsProfitable;
 }
-} // namespace
 
 // Returns true if the skip branch instruction is removed.
 bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
@@ -414,6 +416,8 @@ bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   TII = ST.getInstrInfo();
   TRI = &TII->getRegisterInfo();
+  Traces = getAnalysis<MachineTraceMetricsWrapperPass>().getMTM().getEnsemble(
+      llvm::MachineTraceStrategy::TS_MinInstrCount);
   bool Changed = false;
 
   MF.RenumberBlocks();

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
@@ -249,11 +249,10 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts,
 ; GFX10-NEXT:  .LBB3_6: ; %Flow1
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_and_saveexec_b32 s4, s6
-; GFX10-NEXT:    s_cbranch_execz .LBB3_8
 ; GFX10-NEXT:  ; %bb.7: ; %block.after.loop
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 5
 ; GFX10-NEXT:    flat_store_dword v[3:4], v0
-; GFX10-NEXT:  .LBB3_8: ; %exit
+; GFX10-NEXT:  ; %bb.8: ; %exit
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -340,10 +339,9 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
 ; GFX10-NEXT:  .LBB4_6: ; %cond.block.1
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_and_saveexec_b32 s4, s6
-; GFX10-NEXT:    s_cbranch_execz .LBB4_8
 ; GFX10-NEXT:  ; %bb.7: ; %if.block.1
 ; GFX10-NEXT:    global_store_dword v[6:7], v4, off
-; GFX10-NEXT:  .LBB4_8: ; %exit
+; GFX10-NEXT:  ; %bb.8: ; %exit
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -534,11 +532,10 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX10-NEXT:    s_and_saveexec_b32 s0, s1
 ; GFX10-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX10-NEXT:    s_cbranch_execz .LBB6_6
 ; GFX10-NEXT:  ; %bb.5: ; %break.body
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 10
 ; GFX10-NEXT:    global_store_dword v[4:5], v0, off
-; GFX10-NEXT:  .LBB6_6: ; %exit
+; GFX10-NEXT:  ; %bb.6: ; %exit
 ; GFX10-NEXT:    s_endpgm
 entry:
   br label %A

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
@@ -10,12 +10,13 @@ define amdgpu_ps void @divergent_i1_phi_if_then(ptr addrspace(1) %out, i32 %tid,
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, 6, v2
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB0_2
 ; GFX10-NEXT:  ; %bb.1: ; %B
 ; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 1, v2
 ; GFX10-NEXT:    s_andn2_b32 s0, s0, exec_lo
 ; GFX10-NEXT:    s_and_b32 s2, exec_lo, vcc_lo
 ; GFX10-NEXT:    s_or_b32 s0, s0, s2
-; GFX10-NEXT:  ; %bb.2: ; %exit
+; GFX10-NEXT:  .LBB0_2: ; %exit
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, 2, v2
@@ -46,20 +47,22 @@ define amdgpu_ps void @divergent_i1_phi_if_else(ptr addrspace(1) %out, i32 %tid,
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
 ; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX10-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX10-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX10-NEXT:  ; %bb.1: ; %B
 ; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 2, v2
 ; GFX10-NEXT:    s_andn2_b32 s0, s0, exec_lo
 ; GFX10-NEXT:    ; implicit-def: $vgpr2
 ; GFX10-NEXT:    s_and_b32 s2, exec_lo, vcc_lo
 ; GFX10-NEXT:    s_or_b32 s0, s0, s2
-; GFX10-NEXT:  ; %bb.2: ; %Flow
+; GFX10-NEXT:  .LBB1_2: ; %Flow
 ; GFX10-NEXT:    s_andn2_saveexec_b32 s1, s1
+; GFX10-NEXT:    s_cbranch_execz .LBB1_4
 ; GFX10-NEXT:  ; %bb.3: ; %A
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 1, v2
 ; GFX10-NEXT:    s_andn2_b32 s0, s0, exec_lo
 ; GFX10-NEXT:    s_and_b32 s2, exec_lo, vcc_lo
 ; GFX10-NEXT:    s_or_b32 s0, s0, s2
-; GFX10-NEXT:  ; %bb.4: ; %exit
+; GFX10-NEXT:  .LBB1_4: ; %exit
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, 2, v2
@@ -437,11 +440,10 @@ define amdgpu_cs void @loop_with_div_break_with_body(ptr addrspace(1) %x, ptr ad
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX10-NEXT:    s_and_saveexec_b32 s0, s1
 ; GFX10-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX10-NEXT:    s_cbranch_execz .LBB5_6
 ; GFX10-NEXT:  ; %bb.5: ; %break.body
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 10
 ; GFX10-NEXT:    global_store_dword v[4:5], v0, off
-; GFX10-NEXT:  .LBB5_6: ; %exit
+; GFX10-NEXT:  ; %bb.6: ; %exit
 ; GFX10-NEXT:    s_endpgm
 entry:
   br label %A

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
@@ -433,20 +433,22 @@ define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
 ; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX10-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX10-NEXT:    s_cbranch_execz .LBB20_2
 ; GFX10-NEXT:  ; %bb.1: ; %B
 ; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 2, v2
 ; GFX10-NEXT:    s_andn2_b32 s0, s0, exec_lo
 ; GFX10-NEXT:    ; implicit-def: $vgpr2
 ; GFX10-NEXT:    s_and_b32 s2, exec_lo, vcc_lo
 ; GFX10-NEXT:    s_or_b32 s0, s0, s2
-; GFX10-NEXT:  ; %bb.2: ; %Flow
+; GFX10-NEXT:  .LBB20_2: ; %Flow
 ; GFX10-NEXT:    s_andn2_saveexec_b32 s1, s1
+; GFX10-NEXT:    s_cbranch_execz .LBB20_4
 ; GFX10-NEXT:  ; %bb.3: ; %A
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 1, v2
 ; GFX10-NEXT:    s_andn2_b32 s0, s0, exec_lo
 ; GFX10-NEXT:    s_and_b32 s2, exec_lo, vcc_lo
 ; GFX10-NEXT:    s_or_b32 s0, s0, s2
-; GFX10-NEXT:  ; %bb.4: ; %exit
+; GFX10-NEXT:  .LBB20_4: ; %exit
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX10-NEXT:    s_and_b32 s0, s0, exec_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s0
@@ -460,20 +462,22 @@ define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid
 ; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
 ; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v3
 ; GFX11-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX11-NEXT:    s_cbranch_execz .LBB20_2
 ; GFX11-NEXT:  ; %bb.1: ; %B
 ; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 2, v2
 ; GFX11-NEXT:    s_and_not1_b32 s0, s0, exec_lo
 ; GFX11-NEXT:    ; implicit-def: $vgpr2
 ; GFX11-NEXT:    s_and_b32 s2, exec_lo, vcc_lo
 ; GFX11-NEXT:    s_or_b32 s0, s0, s2
-; GFX11-NEXT:  ; %bb.2: ; %Flow
+; GFX11-NEXT:  .LBB20_2: ; %Flow
 ; GFX11-NEXT:    s_and_not1_saveexec_b32 s1, s1
+; GFX11-NEXT:    s_cbranch_execz .LBB20_4
 ; GFX11-NEXT:  ; %bb.3: ; %A
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 1, v2
 ; GFX11-NEXT:    s_and_not1_b32 s0, s0, exec_lo
 ; GFX11-NEXT:    s_and_b32 s2, exec_lo, vcc_lo
 ; GFX11-NEXT:    s_or_b32 s0, s0, s2
-; GFX11-NEXT:  ; %bb.4: ; %exit
+; GFX11-NEXT:  .LBB20_4: ; %exit
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    s_and_b32 s0, s0, exec_lo
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s0

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
@@ -436,20 +436,22 @@ define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid
 ; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
 ; CHECK-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; CHECK-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; CHECK-NEXT:    s_cbranch_execz .LBB20_2
 ; CHECK-NEXT:  ; %bb.1: ; %B
 ; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, 2, v2
 ; CHECK-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; CHECK-NEXT:    s_and_b64 s[4:5], exec, vcc
 ; CHECK-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
 ; CHECK-NEXT:    ; implicit-def: $vgpr2
-; CHECK-NEXT:  ; %bb.2: ; %Flow
+; CHECK-NEXT:  .LBB20_2: ; %Flow
 ; CHECK-NEXT:    s_andn2_saveexec_b64 s[2:3], s[2:3]
+; CHECK-NEXT:    s_cbranch_execz .LBB20_4
 ; CHECK-NEXT:  ; %bb.3: ; %A
 ; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, 1, v2
 ; CHECK-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; CHECK-NEXT:    s_and_b64 s[4:5], exec, vcc
 ; CHECK-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
-; CHECK-NEXT:  ; %bb.4: ; %exit
+; CHECK-NEXT:  .LBB20_4: ; %exit
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; CHECK-NEXT:    s_and_b64 s[0:1], s[0:1], exec
 ; CHECK-NEXT:    v_mov_b32_e32 v3, s1

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
@@ -68,10 +68,9 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    global_load_dword v1, v2, s[0:1]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
 ; GFX906-NEXT:    global_load_dword v1, v2, s[2:3]
-; GFX906-NEXT:  .LBB1_2: ; %bb.2
+; GFX906-NEXT:  ; %bb.2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
@@ -149,10 +148,9 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[0:1]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
 ; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[2:3]
-; GFX906-NEXT:  .LBB3_2: ; %bb.2
+; GFX906-NEXT:  ; %bb.2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
@@ -185,10 +183,9 @@ define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    global_load_dwordx4 v[1:4], v5, s[0:1]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
 ; GFX906-NEXT:    global_load_dwordx4 v[1:4], v5, s[2:3]
-; GFX906-NEXT:  .LBB4_2: ; %bb.2
+; GFX906-NEXT:  ; %bb.2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
@@ -222,11 +219,10 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
 ; GFX906-NEXT:    global_load_dwordx4 v[1:4], v9, s[0:1]
 ; GFX906-NEXT:    global_load_dwordx4 v[5:8], v9, s[0:1] offset:16
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
 ; GFX906-NEXT:    global_load_dwordx4 v[1:4], v9, s[2:3]
 ; GFX906-NEXT:    global_load_dwordx4 v[5:8], v9, s[2:3] offset:16
-; GFX906-NEXT:  .LBB5_2: ; %bb.2
+; GFX906-NEXT:  ; %bb.2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
@@ -547,11 +543,10 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac
 ; GFX906-NEXT:    global_load_dwordx2 v[1:2], v5, s[10:11]
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
 ; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX906-NEXT:  ; %bb.2: ; %bb.2
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    global_store_dwordx2 v0, v[3:4], s[12:13]
-; GFX906-NEXT:  .LBB9_3: ; %Flow
+; GFX906-NEXT:  ; %bb.3: ; %Flow
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX906-NEXT:  .LBB9_4: ; %bb.3
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]