Skip to content

Commit 54d31bd

Browse files
authored
Reapply "StructurizeCFG: Optimize phi insertion during ssa reconstruction (#101301)" (#114347)
This reverts commit be40c72.
1 parent 51a4f31 commit 54d31bd

File tree

6 files changed

+168
-96
lines changed

6 files changed

+168
-96
lines changed

llvm/lib/Transforms/Scalar/StructurizeCFG.cpp

Lines changed: 115 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
#include "llvm/Transforms/Scalar/StructurizeCFG.h"
1010
#include "llvm/ADT/DenseMap.h"
11+
#include "llvm/ADT/EquivalenceClasses.h"
1112
#include "llvm/ADT/MapVector.h"
1213
#include "llvm/ADT/SCCIterator.h"
1314
#include "llvm/ADT/STLExtras.h"
@@ -325,6 +326,10 @@ class StructurizeCFG {
325326
void findUndefBlocks(BasicBlock *PHIBlock,
326327
const SmallSet<BasicBlock *, 8> &Incomings,
327328
SmallVector<BasicBlock *> &UndefBlks) const;
329+
330+
void mergeIfCompatible(EquivalenceClasses<PHINode *> &PhiClasses, PHINode *A,
331+
PHINode *B);
332+
328333
void setPhiValues();
329334

330335
void simplifyAffectedPhis();
@@ -755,39 +760,130 @@ void StructurizeCFG::findUndefBlocks(
755760
}
756761
}
757762

763+
// Merge the equivalence classes of \p A and \p B when the incoming values of
764+
// their leader phis are compatible: for each incoming block, either both
765+
// leaders have the same incoming value or only one of them has an incoming
766+
// value. On a merge, the DeletedPhis entry of A's leader is overwritten with
767+
// the combined incoming values and the two classes are joined in
768+
// \p PhiClasses. If the leaders conflict, the function returns without merging.
769+
void StructurizeCFG::mergeIfCompatible(
770+
EquivalenceClasses<PHINode *> &PhiClasses, PHINode *A, PHINode *B) {
771+
// Insert both phis into PhiClasses and look up the leader of each one's class.
auto ItA = PhiClasses.findLeader(PhiClasses.insert(A));
772+
auto ItB = PhiClasses.findLeader(PhiClasses.insert(B));
773+
// Both phis already share a leader, so there is nothing to merge.
774+
if (ItA == ItB)
775+
return;
776+
777+
PHINode *LeaderA = *ItA;
778+
PHINode *LeaderB = *ItB;
779+
// The incoming (block, value) lists recorded in DeletedPhis for each leader.
BBValueVector &IncomingA = DeletedPhis[LeaderA->getParent()][LeaderA];
780+
BBValueVector &IncomingB = DeletedPhis[LeaderB->getParent()][LeaderB];
781+
782+
// Start from leader A's incoming values and try to add leader B's to them.
DenseMap<BasicBlock *, Value *> Mergeable(IncomingA.begin(), IncomingA.end());
783+
for (auto [BB, V] : IncomingB) {
784+
auto BBIt = Mergeable.find(BB);
785+
// Conflict: both leaders have a value for BB and the values differ. Bail out
// before IncomingA or the classes are touched.
if (BBIt != Mergeable.end() && BBIt->second != V)
786+
return;
787+
// Either IncomingA has no value for this block, or it has the same
788+
// value as IncomingB.
789+
Mergeable.insert({BB, V});
790+
}
791+
792+
// Replace leader A's incoming values with the merged set, then join the two
// classes.
// NOTE(review): Mergeable is a DenseMap, so the order of the entries written
// back to IncomingA presumably differs from the original order - confirm that
// no consumer of DeletedPhis depends on it.
// NOTE(review): this assumes LeaderA remains the leader of the joined class
// after unionSets(ItA, ItB) - confirm against EquivalenceClasses.
793+
IncomingA.assign(Mergeable.begin(), Mergeable.end());
794+
PhiClasses.unionSets(ItA, ItB);
795+
}
796+
758797
/// Add the real PHI value as soon as everything is set up
759798
void StructurizeCFG::setPhiValues() {
760799
SmallVector<PHINode *, 8> InsertedPhis;
761800
SSAUpdater Updater(&InsertedPhis);
801+
DenseMap<BasicBlock *, SmallVector<BasicBlock *>> UndefBlksMap;
802+
803+
// Find phi nodes that have compatible incoming values (either they have
804+
// the same value for the same block or only one phi node has an incoming
805+
// value, see example below). We only compare a phi against the phis that it
806+
// references as incoming values, which is the case we care about.
807+
//
808+
// For example (-- means no incoming value):
809+
// phi1 : BB1:phi2 BB2:v BB3:--
810+
// phi2: BB1:-- BB2:v BB3:w
811+
//
812+
// Then we can merge these incoming values and let phi1, phi2 use the
813+
// same set of incoming values:
814+
//
815+
// phi1&phi2: BB1:phi2 BB2:v BB3:w
816+
//
817+
// By doing this, phi1 and phi2 would share more intermediate phi nodes.
818+
// This would help reduce the number of phi nodes during SSA reconstruction
819+
// and ultimately result in fewer COPY instructions.
820+
//
821+
// This should be correct, because if a phi node does not have an incoming
822+
// value from a certain block, this means the block is not a predecessor
823+
// of the parent block, so we actually don't care about its incoming value.
824+
EquivalenceClasses<PHINode *> PhiClasses;
825+
for (const auto &[To, From] : AddedPhis) {
826+
auto OldPhiIt = DeletedPhis.find(To);
827+
if (OldPhiIt == DeletedPhis.end())
828+
continue;
829+
830+
PhiMap &BlkPhis = OldPhiIt->second;
831+
SmallVector<BasicBlock *> &UndefBlks = UndefBlksMap[To];
832+
SmallSet<BasicBlock *, 8> Incomings;
833+
834+
// Get the undefined blocks shared by all the phi nodes.
835+
if (!BlkPhis.empty()) {
836+
for (const auto &VI : BlkPhis.front().second)
837+
Incomings.insert(VI.first);
838+
findUndefBlocks(To, Incomings, UndefBlks);
839+
}
840+
841+
for (const auto &[Phi, Incomings] : OldPhiIt->second) {
842+
SmallVector<PHINode *> IncomingPHIs;
843+
for (const auto &[BB, V] : Incomings) {
844+
// First, for each phi, check whether it has an incoming value that is
845+
// another phi.
846+
if (PHINode *P = dyn_cast<PHINode>(V))
847+
IncomingPHIs.push_back(P);
848+
}
849+
850+
for (auto *OtherPhi : IncomingPHIs) {
851+
// Skip phis that are unrelated to the phi reconstruction for now.
852+
if (!DeletedPhis.contains(OtherPhi->getParent()))
853+
continue;
854+
mergeIfCompatible(PhiClasses, Phi, OtherPhi);
855+
}
856+
}
857+
}
858+
762859
for (const auto &AddedPhi : AddedPhis) {
763860
BasicBlock *To = AddedPhi.first;
764861
const BBVector &From = AddedPhi.second;
765862

766863
if (!DeletedPhis.count(To))
767864
continue;
768865

769-
SmallVector<BasicBlock *> UndefBlks;
770-
bool CachedUndefs = false;
771866
PhiMap &Map = DeletedPhis[To];
772-
for (const auto &PI : Map) {
773-
PHINode *Phi = PI.first;
867+
SmallVector<BasicBlock *> &UndefBlks = UndefBlksMap[To];
868+
for (const auto &[Phi, Incoming] : Map) {
774869
Value *Undef = UndefValue::get(Phi->getType());
775870
Updater.Initialize(Phi->getType(), "");
776871
Updater.AddAvailableValue(&Func->getEntryBlock(), Undef);
777872
Updater.AddAvailableValue(To, Undef);
778873

779-
SmallSet<BasicBlock *, 8> Incomings;
780-
SmallVector<BasicBlock *> ConstantPreds;
781-
for (const auto &VI : PI.second) {
782-
Incomings.insert(VI.first);
783-
Updater.AddAvailableValue(VI.first, VI.second);
784-
if (isa<Constant>(VI.second))
785-
ConstantPreds.push_back(VI.first);
786-
}
874+
// Use the leader phi's incoming values if this phi has a leader other than itself.
875+
auto LeaderIt = PhiClasses.findLeader(Phi);
876+
bool UseIncomingOfLeader =
877+
LeaderIt != PhiClasses.member_end() && *LeaderIt != Phi;
878+
const auto &IncomingMap =
879+
UseIncomingOfLeader ? DeletedPhis[(*LeaderIt)->getParent()][*LeaderIt]
880+
: Incoming;
787881

788-
if (!CachedUndefs) {
789-
findUndefBlocks(To, Incomings, UndefBlks);
790-
CachedUndefs = true;
882+
SmallVector<BasicBlock *> ConstantPreds;
883+
for (const auto &[BB, V] : IncomingMap) {
884+
Updater.AddAvailableValue(BB, V);
885+
if (isa<Constant>(V))
886+
ConstantPreds.push_back(BB);
791887
}
792888

793889
for (auto UB : UndefBlks) {
@@ -798,17 +894,18 @@ void StructurizeCFG::setPhiValues() {
798894
if (any_of(ConstantPreds,
799895
[&](BasicBlock *CP) { return DT->dominates(CP, UB); }))
800896
continue;
897+
// The block may already have a value through sharing with other phi nodes.
898+
if (Updater.HasValueForBlock(UB))
899+
continue;
900+
801901
Updater.AddAvailableValue(UB, Undef);
802902
}
803903

804904
for (BasicBlock *FI : From)
805905
Phi->setIncomingValueForBlock(FI, Updater.GetValueAtEndOfBlock(FI));
806906
AffectedPhis.push_back(Phi);
807907
}
808-
809-
DeletedPhis.erase(To);
810908
}
811-
assert(DeletedPhis.empty());
812909

813910
AffectedPhis.append(InsertedPhis.begin(), InsertedPhis.end());
814911
}

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -298,7 +298,7 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
298298
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
299299
; GFX10-NEXT: s_mov_b32 s5, 0
300300
; GFX10-NEXT: ; implicit-def: $sgpr6
301-
; GFX10-NEXT: v_mov_b32_e32 v5, s5
301+
; GFX10-NEXT: v_mov_b32_e32 v4, s5
302302
; GFX10-NEXT: s_branch .LBB4_2
303303
; GFX10-NEXT: .LBB4_1: ; %Flow
304304
; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1
@@ -312,7 +312,6 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
312312
; GFX10-NEXT: s_cbranch_execz .LBB4_6
313313
; GFX10-NEXT: .LBB4_2: ; %cond.block.0
314314
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
315-
; GFX10-NEXT: v_mov_b32_e32 v4, v5
316315
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
317316
; GFX10-NEXT: s_and_saveexec_b32 s7, vcc_lo
318317
; GFX10-NEXT: s_cbranch_execz .LBB4_4
@@ -329,12 +328,11 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
329328
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7
330329
; GFX10-NEXT: v_cmp_ne_u32_e64 s4, v1, v4
331330
; GFX10-NEXT: s_mov_b32 s7, -1
332-
; GFX10-NEXT: ; implicit-def: $vgpr5
333331
; GFX10-NEXT: s_and_saveexec_b32 s8, s4
334332
; GFX10-NEXT: s_cbranch_execz .LBB4_1
335333
; GFX10-NEXT: ; %bb.5: ; %loop.cond
336334
; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1
337-
; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v4
335+
; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v4
338336
; GFX10-NEXT: s_andn2_b32 s4, -1, exec_lo
339337
; GFX10-NEXT: s_and_b32 s7, exec_lo, 0
340338
; GFX10-NEXT: s_or_b32 s7, s4, s7

llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll

Lines changed: 9 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -576,11 +576,11 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
576576
; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1]
577577
; GFX908-NEXT: v_mov_b32_e32 v4, s8
578578
; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v6
579-
; GFX908-NEXT: v_mov_b32_e32 v8, s8
580579
; GFX908-NEXT: v_mov_b32_e32 v6, s8
580+
; GFX908-NEXT: v_mov_b32_e32 v8, s8
581581
; GFX908-NEXT: v_mov_b32_e32 v5, s9
582-
; GFX908-NEXT: v_mov_b32_e32 v9, s9
583582
; GFX908-NEXT: v_mov_b32_e32 v7, s9
583+
; GFX908-NEXT: v_mov_b32_e32 v9, s9
584584
; GFX908-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0
585585
; GFX908-NEXT: v_mov_b32_e32 v11, v5
586586
; GFX908-NEXT: s_mov_b64 s[18:19], s[10:11]
@@ -641,10 +641,10 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
641641
; GFX908-NEXT: v_add_f32_e32 v12, v20, v12
642642
; GFX908-NEXT: v_add_f32_e32 v5, v5, v25
643643
; GFX908-NEXT: v_add_f32_e32 v4, v4, v24
644-
; GFX908-NEXT: v_add_f32_e32 v9, v9, v27
645-
; GFX908-NEXT: v_add_f32_e32 v8, v8, v26
646-
; GFX908-NEXT: v_add_f32_e32 v6, v6, v14
647-
; GFX908-NEXT: v_add_f32_e32 v7, v7, v15
644+
; GFX908-NEXT: v_add_f32_e32 v7, v7, v27
645+
; GFX908-NEXT: v_add_f32_e32 v6, v6, v26
646+
; GFX908-NEXT: v_add_f32_e32 v8, v8, v14
647+
; GFX908-NEXT: v_add_f32_e32 v9, v9, v15
648648
; GFX908-NEXT: v_add_f32_e32 v10, v10, v12
649649
; GFX908-NEXT: v_add_f32_e32 v11, v11, v13
650650
; GFX908-NEXT: s_mov_b64 s[20:21], -1
@@ -654,10 +654,6 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
654654
; GFX908-NEXT: s_andn2_b64 vcc, exec, s[20:21]
655655
; GFX908-NEXT: s_cbranch_vccz .LBB3_4
656656
; GFX908-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
657-
; GFX908-NEXT: ; implicit-def: $vgpr10_vgpr11
658-
; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7
659-
; GFX908-NEXT: ; implicit-def: $vgpr8_vgpr9
660-
; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5
661657
; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3
662658
; GFX908-NEXT: ; implicit-def: $sgpr18_sgpr19
663659
; GFX908-NEXT: .LBB3_9: ; %loop.exit.guard
@@ -743,8 +739,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
743739
; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1]
744740
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[8:9], s[8:9] op_sel:[0,1]
745741
; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v8
746-
; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[8:9], s[8:9] op_sel:[0,1]
747742
; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
743+
; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[8:9], s[8:9] op_sel:[0,1]
748744
; GFX90A-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0
749745
; GFX90A-NEXT: s_mov_b64 s[18:19], s[10:11]
750746
; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1]
@@ -800,8 +796,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
800796
; GFX90A-NEXT: v_pk_add_f32 v[16:17], v[22:23], v[16:17]
801797
; GFX90A-NEXT: v_pk_add_f32 v[14:15], v[20:21], v[14:15]
802798
; GFX90A-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[24:25]
803-
; GFX90A-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[26:27]
804-
; GFX90A-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[16:17]
799+
; GFX90A-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[26:27]
800+
; GFX90A-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[16:17]
805801
; GFX90A-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[14:15]
806802
; GFX90A-NEXT: s_mov_b64 s[20:21], -1
807803
; GFX90A-NEXT: s_branch .LBB3_4
@@ -810,10 +806,6 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
810806
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[20:21]
811807
; GFX90A-NEXT: s_cbranch_vccz .LBB3_4
812808
; GFX90A-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
813-
; GFX90A-NEXT: ; implicit-def: $vgpr12_vgpr13
814-
; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9
815-
; GFX90A-NEXT: ; implicit-def: $vgpr10_vgpr11
816-
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
817809
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
818810
; GFX90A-NEXT: ; implicit-def: $sgpr18_sgpr19
819811
; GFX90A-NEXT: .LBB3_9: ; %loop.exit.guard

llvm/test/CodeGen/AMDGPU/while-break.ll

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -162,8 +162,8 @@ define amdgpu_ps < 2 x float> @while_break_two_chains_of_phi(float %v, i32 %x, i
162162
; GCN-NEXT: s_branch .LBB2_2
163163
; GCN-NEXT: .LBB2_1: ; %Flow1
164164
; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1
165-
; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s1
166-
; GCN-NEXT: s_and_b32 s1, exec_lo, s4
165+
; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4
166+
; GCN-NEXT: s_and_b32 s1, exec_lo, s1
167167
; GCN-NEXT: s_or_b32 s2, s1, s2
168168
; GCN-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
169169
; GCN-NEXT: s_cbranch_execz .LBB2_6
@@ -190,20 +190,17 @@ define amdgpu_ps < 2 x float> @while_break_two_chains_of_phi(float %v, i32 %x, i
190190
; GCN-NEXT: .LBB2_4: ; %Flow
191191
; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1
192192
; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4
193-
; GCN-NEXT: v_mov_b32_e32 v7, v6
194-
; GCN-NEXT: s_mov_b32 s4, -1
195-
; GCN-NEXT: s_and_saveexec_b32 s1, s3
193+
; GCN-NEXT: s_mov_b32 s1, -1
194+
; GCN-NEXT: s_and_saveexec_b32 s4, s3
196195
; GCN-NEXT: s_cbranch_execz .LBB2_1
197196
; GCN-NEXT: ; %bb.5: ; %latch
198197
; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1
199198
; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, s0, v3
200-
; GCN-NEXT: v_mov_b32_e32 v7, v0
201199
; GCN-NEXT: s_add_i32 s0, s0, 1
202-
; GCN-NEXT: s_orn2_b32 s4, vcc_lo, exec_lo
200+
; GCN-NEXT: s_orn2_b32 s1, vcc_lo, exec_lo
203201
; GCN-NEXT: s_branch .LBB2_1
204202
; GCN-NEXT: .LBB2_6: ; %end
205203
; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s2
206-
; GCN-NEXT: v_mov_b32_e32 v0, v7
207204
; GCN-NEXT: v_mov_b32_e32 v1, v6
208205
; GCN-NEXT: ; return to shader part epilog
209206
entry:

llvm/test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ define amdgpu_kernel void @loop_subregion_misordered(ptr addrspace(1) %arg0) #0
2828
; CHECK-NEXT: [[I_INITIAL:%.*]] = load volatile i32, ptr addrspace(1) [[GEP]], align 4
2929
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
3030
; CHECK: LOOP.HEADER:
31-
; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_INITIAL]], [[ENTRY:%.*]] ], [ [[TMP5:%.*]], [[FLOW3:%.*]] ]
31+
; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_INITIAL]], [[ENTRY:%.*]] ], [ [[TMP3:%.*]], [[FLOW3:%.*]] ]
3232
; CHECK-NEXT: call void asm sideeffect "s_nop 0x100b
3333
; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[I]] to i64
3434
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) null, i64 [[TMP12]]
@@ -49,8 +49,8 @@ define amdgpu_kernel void @loop_subregion_misordered(ptr addrspace(1) %arg0) #0
4949
; CHECK-NEXT: [[TMP25:%.*]] = mul nuw nsw i32 [[TMP24]], 52
5050
; CHECK-NEXT: br label [[INNER_LOOP:%.*]]
5151
; CHECK: Flow2:
52-
; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[TMP59:%.*]], [[INNER_LOOP_BREAK:%.*]] ], [ [[TMP7:%.*]], [[FLOW]] ]
53-
; CHECK-NEXT: [[TMP4:%.*]] = phi i1 [ true, [[INNER_LOOP_BREAK]] ], [ [[TMP9:%.*]], [[FLOW]] ]
52+
; CHECK-NEXT: [[TMP3]] = phi i32 [ [[TMP59:%.*]], [[INNER_LOOP_BREAK:%.*]] ], [ [[TMP6:%.*]], [[FLOW]] ]
53+
; CHECK-NEXT: [[TMP4:%.*]] = phi i1 [ true, [[INNER_LOOP_BREAK]] ], [ [[TMP8:%.*]], [[FLOW]] ]
5454
; CHECK-NEXT: br i1 [[TMP4]], label [[END_ELSE_BLOCK:%.*]], label [[FLOW3]]
5555
; CHECK: INNER_LOOP:
5656
; CHECK-NEXT: [[INNER_LOOP_J:%.*]] = phi i32 [ [[INNER_LOOP_J_INC:%.*]], [[INNER_LOOP]] ], [ [[TMP25]], [[BB18:%.*]] ]
@@ -66,20 +66,19 @@ define amdgpu_kernel void @loop_subregion_misordered(ptr addrspace(1) %arg0) #0
6666
; CHECK-NEXT: [[LOAD13:%.*]] = icmp uge i32 [[TMP16]], 271
6767
; CHECK-NEXT: br i1 [[LOAD13]], label [[INCREMENT_I]], label [[FLOW1:%.*]]
6868
; CHECK: Flow3:
69-
; CHECK-NEXT: [[TMP5]] = phi i32 [ [[TMP3]], [[END_ELSE_BLOCK]] ], [ undef, [[FLOW2]] ]
70-
; CHECK-NEXT: [[TMP6:%.*]] = phi i1 [ [[CMP_END_ELSE_BLOCK:%.*]], [[END_ELSE_BLOCK]] ], [ true, [[FLOW2]] ]
71-
; CHECK-NEXT: br i1 [[TMP6]], label [[FLOW4:%.*]], label [[LOOP_HEADER]]
69+
; CHECK-NEXT: [[TMP5:%.*]] = phi i1 [ [[CMP_END_ELSE_BLOCK:%.*]], [[END_ELSE_BLOCK]] ], [ true, [[FLOW2]] ]
70+
; CHECK-NEXT: br i1 [[TMP5]], label [[FLOW4:%.*]], label [[LOOP_HEADER]]
7271
; CHECK: Flow4:
73-
; CHECK-NEXT: br i1 [[TMP8:%.*]], label [[BB64:%.*]], label [[RETURN:%.*]]
72+
; CHECK-NEXT: br i1 [[TMP7:%.*]], label [[BB64:%.*]], label [[RETURN:%.*]]
7473
; CHECK: bb64:
7574
; CHECK-NEXT: call void asm sideeffect "s_nop 42", "~{memory}"() #[[ATTR0]]
7675
; CHECK-NEXT: br label [[RETURN]]
7776
; CHECK: Flow:
78-
; CHECK-NEXT: [[TMP7]] = phi i32 [ [[TMP0]], [[FLOW1]] ], [ undef, [[LOOP_HEADER]] ]
79-
; CHECK-NEXT: [[TMP8]] = phi i1 [ [[TMP1]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ]
80-
; CHECK-NEXT: [[TMP9]] = phi i1 [ [[TMP2]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ]
81-
; CHECK-NEXT: [[TMP10:%.*]] = phi i1 [ false, [[FLOW1]] ], [ true, [[LOOP_HEADER]] ]
82-
; CHECK-NEXT: br i1 [[TMP10]], label [[BB18]], label [[FLOW2]]
77+
; CHECK-NEXT: [[TMP6]] = phi i32 [ [[TMP0]], [[FLOW1]] ], [ undef, [[LOOP_HEADER]] ]
78+
; CHECK-NEXT: [[TMP7]] = phi i1 [ [[TMP1]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ]
79+
; CHECK-NEXT: [[TMP8]] = phi i1 [ [[TMP2]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ]
80+
; CHECK-NEXT: [[TMP9:%.*]] = phi i1 [ false, [[FLOW1]] ], [ true, [[LOOP_HEADER]] ]
81+
; CHECK-NEXT: br i1 [[TMP9]], label [[BB18]], label [[FLOW2]]
8382
; CHECK: INCREMENT_I:
8483
; CHECK-NEXT: [[INC_I]] = add i32 [[I]], 1
8584
; CHECK-NEXT: call void asm sideeffect "s_nop 0x1336

0 commit comments

Comments
 (0)