Skip to content

Commit c62e2a2

Browse files
authored
StructurizeCFG: Optimize phi insertion during ssa reconstruction (#101301)
After investigating more while-break cases, I think we should try to optimize the way we reconstruct phi nodes. Previously, we reconstructed each phi node separately, but this is not optimal. For example:
```
header:
  %v.1 = phi float [ %v, %entry ], [ %v.2, %latch ]
  br i1 %cc, label %if, label %latch
if:
  %v.if = fadd float %v.1, 1.0
  br i1 %cc2, label %latch, label %exit
latch:
  %v.2 = phi float [ %v.if, %if ], [ %v.1, %header ]
  br i1 %cc3, label %exit, label %header
exit:
  %v.3 = phi float [ %v.2, %latch ], [ %v.if, %if ]
```
For this case, we have different copies of value `v`, but there is at most one copy of value `v` alive at any program point shown above. The existing SSA reconstruction will use the incoming values from the old deleted phi. Below is a possible output after SSA reconstruction.
```
header:
  %v.1 = phi float [ %v, %entry ], [ %v.loop, %Flow1 ]
  br i1 %cc, label %if, label %flow
if:
  %v.if = fadd float %v.1, 1.0
  br label %flow
flow:
  %v.exit.if = phi float [ %v.if, %if ], [ undef, %header ]
  %v.latch = phi float [ %v.if, %if ], [ %v.1, %header ]
latch:
  br label %flow1
flow1:
  %v.loop = phi float [ %v.latch, %latch ], [ undef, %Flow ]
  %v.exit = phi float [ %v.latch, %latch ], [ %v.exit.if, %Flow ]
exit:
  %v.3 = phi float [ %v.exit, %flow1 ]
```
If we look closely, in order to reconstruct `v.1`, `v.2` and `v.3`, we have two simultaneous copies of `v` alive at `flow` and `flow1`. We depend heavily on the register coalescer to coalesce them together. But the register coalescer may not always be able to coalesce them because of the complexity of the phi chain. On the other hand, given that we have only one copy of `v` alive at any program point before the transform, why not simplify the phi network as much as we can?
Look at the incoming values of these PHIs:
```
        header   if     latch
v.1:    --       --     v.2
v.2:    v.1      v.if   --
v.3:    --       v.if   v.2
```
If we let them share the same incoming values for these three different incoming blocks, then we would have only one live copy of `v` at any program point after SSA reconstruction. Something like:
```
header:
  %v.1 = phi float [ %v, %entry ], [ %v.2, %Flow1 ]
  br i1 %cc, label %if, label %flow
if:
  %v.if = fadd float %v.1, 1.0
  br label %flow
flow:
  %v.2 = phi float [ %v.if, %if ], [ %v.1, %header ]
latch:
  br label %flow1
flow1:
  ...
exit:
  %v.3 = phi float [ %v.2, %flow1 ]
```
1 parent 82f52d9 commit c62e2a2

File tree

5 files changed

+167
-92
lines changed

5 files changed

+167
-92
lines changed

llvm/lib/Transforms/Scalar/StructurizeCFG.cpp

Lines changed: 116 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
#include "llvm/Transforms/Scalar/StructurizeCFG.h"
1010
#include "llvm/ADT/DenseMap.h"
11+
#include "llvm/ADT/EquivalenceClasses.h"
1112
#include "llvm/ADT/MapVector.h"
1213
#include "llvm/ADT/SCCIterator.h"
1314
#include "llvm/ADT/STLExtras.h"
@@ -288,6 +289,10 @@ class StructurizeCFG {
288289
void findUndefBlocks(BasicBlock *PHIBlock,
289290
const SmallSet<BasicBlock *, 8> &Incomings,
290291
SmallVector<BasicBlock *> &UndefBlks) const;
292+
293+
void mergeIfCompatible(EquivalenceClasses<PHINode *> &PhiClasses, PHINode *A,
294+
PHINode *B);
295+
291296
void setPhiValues();
292297

293298
void simplifyAffectedPhis();
@@ -710,39 +715,131 @@ void StructurizeCFG::findUndefBlocks(
710715
}
711716
}
712717

718+
// If two phi nodes have compatible incoming values (for each
719+
// incoming block, either they have the same incoming value or only one phi
720+
// node has an incoming value), let them share the merged incoming values. The
721+
// merge process is guided by the equivalence information from \p PhiClasses.
722+
// The function will possibly update the incoming values of leader phi in
723+
// DeletedPhis.
724+
void StructurizeCFG::mergeIfCompatible(
725+
EquivalenceClasses<PHINode *> &PhiClasses, PHINode *A, PHINode *B) {
726+
auto ItA = PhiClasses.findLeader(PhiClasses.insert(A));
727+
auto ItB = PhiClasses.findLeader(PhiClasses.insert(B));
728+
// They are already in the same class, no work needed.
729+
if (ItA == ItB)
730+
return;
731+
732+
PHINode *LeaderA = *ItA;
733+
PHINode *LeaderB = *ItB;
734+
BBValueVector &IncomingA = DeletedPhis[LeaderA->getParent()][LeaderA];
735+
BBValueVector &IncomingB = DeletedPhis[LeaderB->getParent()][LeaderB];
736+
737+
DenseMap<BasicBlock *, Value *> Mergeable(IncomingA.begin(), IncomingA.end());
738+
for (auto [BB, V] : IncomingB) {
739+
auto BBIt = Mergeable.find(BB);
740+
if (BBIt != Mergeable.end() && BBIt->second != V)
741+
return;
742+
// Either IncomingA does not have this value or IncomingA has the same
743+
// value.
744+
Mergeable.insert({BB, V});
745+
}
746+
747+
// Update the incoming value of leaderA.
748+
IncomingA.assign(Mergeable.begin(), Mergeable.end());
749+
PhiClasses.unionSets(ItA, ItB);
750+
}
751+
713752
/// Add the real PHI value as soon as everything is set up
714753
void StructurizeCFG::setPhiValues() {
715754
SmallVector<PHINode *, 8> InsertedPhis;
716755
SSAUpdater Updater(&InsertedPhis);
756+
DenseMap<BasicBlock *, SmallVector<BasicBlock *>> UndefBlksMap;
757+
758+
// Find phi nodes that have compatible incoming values (either they have
759+
// the same value for the same block or only one phi node has an incoming
760+
// value, see example below). We only search against the phis that are
761+
// referenced by another phi, which is the case we care about.
762+
//
763+
// For example (-- means no incoming value):
764+
// phi1 : BB1:phi2 BB2:v BB3:--
765+
// phi2: BB1:-- BB2:v BB3:w
766+
//
767+
// Then we can merge these incoming values and let phi1, phi2 use the
768+
// same set of incoming values:
769+
//
770+
// phi1&phi2: BB1:phi2 BB2:v BB3:w
771+
//
772+
// By doing this, phi1 and phi2 would share more intermediate phi nodes.
773+
// This would help reduce the number of phi nodes during SSA reconstruction
774+
// and ultimately result in fewer COPY instructions.
775+
//
776+
// This should be correct, because if a phi node does not have an incoming
777+
// value from a certain block, this means the block is not a predecessor
778+
// of the parent block, so we actually don't care about its incoming value.
779+
EquivalenceClasses<PHINode *> PhiClasses;
780+
for (const auto &[To, From] : AddedPhis) {
781+
auto OldPhiIt = DeletedPhis.find(To);
782+
if (OldPhiIt == DeletedPhis.end())
783+
continue;
784+
785+
PhiMap &BlkPhis = OldPhiIt->second;
786+
SmallVector<BasicBlock *> &UndefBlks =
787+
UndefBlksMap.FindAndConstruct(To).second;
788+
SmallSet<BasicBlock *, 8> Incomings;
789+
790+
// Get the undefined blocks shared by all the phi nodes.
791+
if (!BlkPhis.empty()) {
792+
for (const auto &VI : BlkPhis.front().second)
793+
Incomings.insert(VI.first);
794+
findUndefBlocks(To, Incomings, UndefBlks);
795+
}
796+
797+
for (const auto &[Phi, Incomings] : OldPhiIt->second) {
798+
SmallVector<PHINode *> IncomingPHIs;
799+
for (const auto &[BB, V] : Incomings) {
800+
// First, for each phi, check whether it has incoming value which is
801+
// another phi.
802+
if (PHINode *P = dyn_cast<PHINode>(V))
803+
IncomingPHIs.push_back(P);
804+
}
805+
806+
for (auto *OtherPhi : IncomingPHIs) {
807+
// Skip phis that are unrelated to the phi reconstruction for now.
808+
if (!DeletedPhis.contains(OtherPhi->getParent()))
809+
continue;
810+
mergeIfCompatible(PhiClasses, Phi, OtherPhi);
811+
}
812+
}
813+
}
814+
717815
for (const auto &AddedPhi : AddedPhis) {
718816
BasicBlock *To = AddedPhi.first;
719817
const BBVector &From = AddedPhi.second;
720818

721819
if (!DeletedPhis.count(To))
722820
continue;
723821

724-
SmallVector<BasicBlock *> UndefBlks;
725-
bool CachedUndefs = false;
726822
PhiMap &Map = DeletedPhis[To];
727-
for (const auto &PI : Map) {
728-
PHINode *Phi = PI.first;
823+
SmallVector<BasicBlock *> &UndefBlks = UndefBlksMap[To];
824+
for (const auto &[Phi, Incoming] : Map) {
729825
Value *Undef = UndefValue::get(Phi->getType());
730826
Updater.Initialize(Phi->getType(), "");
731827
Updater.AddAvailableValue(&Func->getEntryBlock(), Undef);
732828
Updater.AddAvailableValue(To, Undef);
733829

734-
SmallSet<BasicBlock *, 8> Incomings;
735-
SmallVector<BasicBlock *> ConstantPreds;
736-
for (const auto &VI : PI.second) {
737-
Incomings.insert(VI.first);
738-
Updater.AddAvailableValue(VI.first, VI.second);
739-
if (isa<Constant>(VI.second))
740-
ConstantPreds.push_back(VI.first);
741-
}
830+
// Use the leader phi's incoming values if there is one.
831+
auto LeaderIt = PhiClasses.findLeader(Phi);
832+
bool UseIncomingOfLeader =
833+
LeaderIt != PhiClasses.member_end() && *LeaderIt != Phi;
834+
const auto &IncomingMap =
835+
UseIncomingOfLeader ? DeletedPhis[(*LeaderIt)->getParent()][*LeaderIt]
836+
: Incoming;
742837

743-
if (!CachedUndefs) {
744-
findUndefBlocks(To, Incomings, UndefBlks);
745-
CachedUndefs = true;
838+
SmallVector<BasicBlock *> ConstantPreds;
839+
for (const auto &[BB, V] : IncomingMap) {
840+
Updater.AddAvailableValue(BB, V);
841+
if (isa<Constant>(V))
842+
ConstantPreds.push_back(BB);
746843
}
747844

748845
for (auto UB : UndefBlks) {
@@ -753,17 +850,18 @@ void StructurizeCFG::setPhiValues() {
753850
if (any_of(ConstantPreds,
754851
[&](BasicBlock *CP) { return DT->dominates(CP, UB); }))
755852
continue;
853+
// The block may already have a value through sharing with other phi nodes.
854+
if (Updater.HasValueForBlock(UB))
855+
continue;
856+
756857
Updater.AddAvailableValue(UB, Undef);
757858
}
758859

759860
for (BasicBlock *FI : From)
760861
Phi->setIncomingValueForBlock(FI, Updater.GetValueAtEndOfBlock(FI));
761862
AffectedPhis.push_back(Phi);
762863
}
763-
764-
DeletedPhis.erase(To);
765864
}
766-
assert(DeletedPhis.empty());
767865

768866
AffectedPhis.append(InsertedPhis.begin(), InsertedPhis.end());
769867
}

llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll

Lines changed: 9 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -577,11 +577,11 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
577577
; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1]
578578
; GFX908-NEXT: v_mov_b32_e32 v4, s8
579579
; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v6
580-
; GFX908-NEXT: v_mov_b32_e32 v8, s8
581580
; GFX908-NEXT: v_mov_b32_e32 v6, s8
581+
; GFX908-NEXT: v_mov_b32_e32 v8, s8
582582
; GFX908-NEXT: v_mov_b32_e32 v5, s9
583-
; GFX908-NEXT: v_mov_b32_e32 v9, s9
584583
; GFX908-NEXT: v_mov_b32_e32 v7, s9
584+
; GFX908-NEXT: v_mov_b32_e32 v9, s9
585585
; GFX908-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0
586586
; GFX908-NEXT: v_mov_b32_e32 v11, v5
587587
; GFX908-NEXT: s_mov_b64 s[18:19], s[10:11]
@@ -642,10 +642,10 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
642642
; GFX908-NEXT: v_add_f32_e32 v12, v20, v12
643643
; GFX908-NEXT: v_add_f32_e32 v5, v5, v25
644644
; GFX908-NEXT: v_add_f32_e32 v4, v4, v24
645-
; GFX908-NEXT: v_add_f32_e32 v9, v9, v27
646-
; GFX908-NEXT: v_add_f32_e32 v8, v8, v26
647-
; GFX908-NEXT: v_add_f32_e32 v6, v6, v14
648-
; GFX908-NEXT: v_add_f32_e32 v7, v7, v15
645+
; GFX908-NEXT: v_add_f32_e32 v7, v7, v27
646+
; GFX908-NEXT: v_add_f32_e32 v6, v6, v26
647+
; GFX908-NEXT: v_add_f32_e32 v8, v8, v14
648+
; GFX908-NEXT: v_add_f32_e32 v9, v9, v15
649649
; GFX908-NEXT: v_add_f32_e32 v10, v10, v12
650650
; GFX908-NEXT: v_add_f32_e32 v11, v11, v13
651651
; GFX908-NEXT: s_mov_b64 s[20:21], -1
@@ -655,10 +655,6 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
655655
; GFX908-NEXT: s_andn2_b64 vcc, exec, s[20:21]
656656
; GFX908-NEXT: s_cbranch_vccz .LBB3_4
657657
; GFX908-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
658-
; GFX908-NEXT: ; implicit-def: $vgpr10_vgpr11
659-
; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7
660-
; GFX908-NEXT: ; implicit-def: $vgpr8_vgpr9
661-
; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5
662658
; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3
663659
; GFX908-NEXT: ; implicit-def: $sgpr18_sgpr19
664660
; GFX908-NEXT: .LBB3_9: ; %loop.exit.guard
@@ -744,8 +740,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
744740
; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1]
745741
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[8:9], s[8:9] op_sel:[0,1]
746742
; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v8
747-
; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[8:9], s[8:9] op_sel:[0,1]
748743
; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
744+
; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[8:9], s[8:9] op_sel:[0,1]
749745
; GFX90A-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0
750746
; GFX90A-NEXT: s_mov_b64 s[18:19], s[10:11]
751747
; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1]
@@ -801,8 +797,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
801797
; GFX90A-NEXT: v_pk_add_f32 v[16:17], v[22:23], v[16:17]
802798
; GFX90A-NEXT: v_pk_add_f32 v[14:15], v[20:21], v[14:15]
803799
; GFX90A-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[24:25]
804-
; GFX90A-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[26:27]
805-
; GFX90A-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[16:17]
800+
; GFX90A-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[26:27]
801+
; GFX90A-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[16:17]
806802
; GFX90A-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[14:15]
807803
; GFX90A-NEXT: s_mov_b64 s[20:21], -1
808804
; GFX90A-NEXT: s_branch .LBB3_4
@@ -811,10 +807,6 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
811807
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[20:21]
812808
; GFX90A-NEXT: s_cbranch_vccz .LBB3_4
813809
; GFX90A-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
814-
; GFX90A-NEXT: ; implicit-def: $vgpr12_vgpr13
815-
; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9
816-
; GFX90A-NEXT: ; implicit-def: $vgpr10_vgpr11
817-
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
818810
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
819811
; GFX90A-NEXT: ; implicit-def: $sgpr18_sgpr19
820812
; GFX90A-NEXT: .LBB3_9: ; %loop.exit.guard

llvm/test/CodeGen/AMDGPU/while-break.ll

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -162,8 +162,8 @@ define amdgpu_ps < 2 x float> @while_break_two_chains_of_phi(float %v, i32 %x, i
162162
; GCN-NEXT: s_branch .LBB2_2
163163
; GCN-NEXT: .LBB2_1: ; %Flow1
164164
; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1
165-
; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s1
166-
; GCN-NEXT: s_and_b32 s1, exec_lo, s4
165+
; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4
166+
; GCN-NEXT: s_and_b32 s1, exec_lo, s1
167167
; GCN-NEXT: s_or_b32 s2, s1, s2
168168
; GCN-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
169169
; GCN-NEXT: s_cbranch_execz .LBB2_6
@@ -190,20 +190,17 @@ define amdgpu_ps < 2 x float> @while_break_two_chains_of_phi(float %v, i32 %x, i
190190
; GCN-NEXT: .LBB2_4: ; %Flow
191191
; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1
192192
; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4
193-
; GCN-NEXT: v_mov_b32_e32 v7, v6
194-
; GCN-NEXT: s_mov_b32 s4, -1
195-
; GCN-NEXT: s_and_saveexec_b32 s1, s3
193+
; GCN-NEXT: s_mov_b32 s1, -1
194+
; GCN-NEXT: s_and_saveexec_b32 s4, s3
196195
; GCN-NEXT: s_cbranch_execz .LBB2_1
197196
; GCN-NEXT: ; %bb.5: ; %latch
198197
; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1
199198
; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, s0, v3
200-
; GCN-NEXT: v_mov_b32_e32 v7, v0
201199
; GCN-NEXT: s_add_i32 s0, s0, 1
202-
; GCN-NEXT: s_orn2_b32 s4, vcc_lo, exec_lo
200+
; GCN-NEXT: s_orn2_b32 s1, vcc_lo, exec_lo
203201
; GCN-NEXT: s_branch .LBB2_1
204202
; GCN-NEXT: .LBB2_6: ; %end
205203
; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s2
206-
; GCN-NEXT: v_mov_b32_e32 v0, v7
207204
; GCN-NEXT: v_mov_b32_e32 v1, v6
208205
; GCN-NEXT: ; return to shader part epilog
209206
entry:

llvm/test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ define amdgpu_kernel void @loop_subregion_misordered(ptr addrspace(1) %arg0) #0
2828
; CHECK-NEXT: [[I_INITIAL:%.*]] = load volatile i32, ptr addrspace(1) [[GEP]], align 4
2929
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
3030
; CHECK: LOOP.HEADER:
31-
; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_INITIAL]], [[ENTRY:%.*]] ], [ [[TMP5:%.*]], [[FLOW3:%.*]] ]
31+
; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_INITIAL]], [[ENTRY:%.*]] ], [ [[TMP3:%.*]], [[FLOW3:%.*]] ]
3232
; CHECK-NEXT: call void asm sideeffect "s_nop 0x100b
3333
; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[I]] to i64
3434
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) null, i64 [[TMP12]]
@@ -49,8 +49,8 @@ define amdgpu_kernel void @loop_subregion_misordered(ptr addrspace(1) %arg0) #0
4949
; CHECK-NEXT: [[TMP25:%.*]] = mul nuw nsw i32 [[TMP24]], 52
5050
; CHECK-NEXT: br label [[INNER_LOOP:%.*]]
5151
; CHECK: Flow2:
52-
; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[TMP59:%.*]], [[INNER_LOOP_BREAK:%.*]] ], [ [[TMP7:%.*]], [[FLOW]] ]
53-
; CHECK-NEXT: [[TMP4:%.*]] = phi i1 [ true, [[INNER_LOOP_BREAK]] ], [ [[TMP9:%.*]], [[FLOW]] ]
52+
; CHECK-NEXT: [[TMP3]] = phi i32 [ [[TMP59:%.*]], [[INNER_LOOP_BREAK:%.*]] ], [ [[TMP6:%.*]], [[FLOW]] ]
53+
; CHECK-NEXT: [[TMP4:%.*]] = phi i1 [ true, [[INNER_LOOP_BREAK]] ], [ [[TMP8:%.*]], [[FLOW]] ]
5454
; CHECK-NEXT: br i1 [[TMP4]], label [[END_ELSE_BLOCK:%.*]], label [[FLOW3]]
5555
; CHECK: INNER_LOOP:
5656
; CHECK-NEXT: [[INNER_LOOP_J:%.*]] = phi i32 [ [[INNER_LOOP_J_INC:%.*]], [[INNER_LOOP]] ], [ [[TMP25]], [[BB18:%.*]] ]
@@ -66,20 +66,19 @@ define amdgpu_kernel void @loop_subregion_misordered(ptr addrspace(1) %arg0) #0
6666
; CHECK-NEXT: [[LOAD13:%.*]] = icmp uge i32 [[TMP16]], 271
6767
; CHECK-NEXT: br i1 [[LOAD13]], label [[INCREMENT_I]], label [[FLOW1:%.*]]
6868
; CHECK: Flow3:
69-
; CHECK-NEXT: [[TMP5]] = phi i32 [ [[TMP3]], [[END_ELSE_BLOCK]] ], [ undef, [[FLOW2]] ]
70-
; CHECK-NEXT: [[TMP6:%.*]] = phi i1 [ [[CMP_END_ELSE_BLOCK:%.*]], [[END_ELSE_BLOCK]] ], [ true, [[FLOW2]] ]
71-
; CHECK-NEXT: br i1 [[TMP6]], label [[FLOW4:%.*]], label [[LOOP_HEADER]]
69+
; CHECK-NEXT: [[TMP5:%.*]] = phi i1 [ [[CMP_END_ELSE_BLOCK:%.*]], [[END_ELSE_BLOCK]] ], [ true, [[FLOW2]] ]
70+
; CHECK-NEXT: br i1 [[TMP5]], label [[FLOW4:%.*]], label [[LOOP_HEADER]]
7271
; CHECK: Flow4:
73-
; CHECK-NEXT: br i1 [[TMP8:%.*]], label [[BB64:%.*]], label [[RETURN:%.*]]
72+
; CHECK-NEXT: br i1 [[TMP7:%.*]], label [[BB64:%.*]], label [[RETURN:%.*]]
7473
; CHECK: bb64:
7574
; CHECK-NEXT: call void asm sideeffect "s_nop 42", "~{memory}"() #[[ATTR0]]
7675
; CHECK-NEXT: br label [[RETURN]]
7776
; CHECK: Flow:
78-
; CHECK-NEXT: [[TMP7]] = phi i32 [ [[TMP0]], [[FLOW1]] ], [ undef, [[LOOP_HEADER]] ]
79-
; CHECK-NEXT: [[TMP8]] = phi i1 [ [[TMP1]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ]
80-
; CHECK-NEXT: [[TMP9]] = phi i1 [ [[TMP2]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ]
81-
; CHECK-NEXT: [[TMP10:%.*]] = phi i1 [ false, [[FLOW1]] ], [ true, [[LOOP_HEADER]] ]
82-
; CHECK-NEXT: br i1 [[TMP10]], label [[BB18]], label [[FLOW2]]
77+
; CHECK-NEXT: [[TMP6]] = phi i32 [ [[TMP0]], [[FLOW1]] ], [ undef, [[LOOP_HEADER]] ]
78+
; CHECK-NEXT: [[TMP7]] = phi i1 [ [[TMP1]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ]
79+
; CHECK-NEXT: [[TMP8]] = phi i1 [ [[TMP2]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ]
80+
; CHECK-NEXT: [[TMP9:%.*]] = phi i1 [ false, [[FLOW1]] ], [ true, [[LOOP_HEADER]] ]
81+
; CHECK-NEXT: br i1 [[TMP9]], label [[BB18]], label [[FLOW2]]
8382
; CHECK: INCREMENT_I:
8483
; CHECK-NEXT: [[INC_I]] = add i32 [[I]], 1
8584
; CHECK-NEXT: call void asm sideeffect "s_nop 0x1336

0 commit comments

Comments
 (0)