Skip to content

Commit 04b6996

Browse files
committed
[Utils][UnifyLoopExits] Avoid costly updates if nothing changed
If the ControlFlowHub did not change the control flow, there is no need to repair SSA, update the loop structure, or run the subsequent verification steps. This is not completely NFC, though: restoreSSA used to introduce PHI nodes with a single entry, and those are no longer created. My code went from 400+ seconds to 1 second because no loop required its exits to be unified, even though there were many "complex" loops.
1 parent 6e7f042 commit 04b6996

File tree

6 files changed

+33
-23
lines changed

6 files changed

+33
-23
lines changed

llvm/include/llvm/Transforms/Utils/ControlFlowUtils.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,9 @@ struct ControlFlowHub {
110110
Branches.emplace_back(BB, Succ0, Succ1);
111111
}
112112

113-
BasicBlock *
113+
/// Return the unified loop exit block and a flag indicating if the CFG was
114+
/// changed at all.
115+
std::pair<BasicBlock *, bool>
114116
finalize(DomTreeUpdater *DTU, SmallVectorImpl<BasicBlock *> &GuardBlocks,
115117
const StringRef Prefix,
116118
std::optional<unsigned> MaxControlFlowBooleans = std::nullopt);

llvm/lib/Transforms/Utils/ControlFlowUtils.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -270,7 +270,7 @@ static void reconnectPhis(BasicBlock *Out, BasicBlock *GuardBlock,
270270
}
271271
}
272272

273-
BasicBlock *ControlFlowHub::finalize(
273+
std::pair<BasicBlock *, bool> ControlFlowHub::finalize(
274274
DomTreeUpdater *DTU, SmallVectorImpl<BasicBlock *> &GuardBlocks,
275275
const StringRef Prefix, std::optional<unsigned> MaxControlFlowBooleans) {
276276
#ifndef NDEBUG
@@ -289,7 +289,7 @@ BasicBlock *ControlFlowHub::finalize(
289289
}
290290

291291
if (Outgoing.size() < 2)
292-
return Outgoing.front();
292+
return {Outgoing.front(), false};
293293

294294
SmallVector<DominatorTree::UpdateType, 16> Updates;
295295
if (DTU) {
@@ -338,5 +338,5 @@ BasicBlock *ControlFlowHub::finalize(
338338
Inst->eraseFromParent();
339339
}
340340

341-
return FirstGuardBlock;
341+
return {FirstGuardBlock, true};
342342
}

llvm/lib/Transforms/Utils/UnifyLoopExits.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,8 +169,12 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) {
169169

170170
SmallVector<BasicBlock *, 8> GuardBlocks;
171171
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
172-
BasicBlock *LoopExitBlock = CHub.finalize(
172+
BasicBlock *LoopExitBlock;
173+
bool ChangedCFG;
174+
std::tie(LoopExitBlock, ChangedCFG) = CHub.finalize(
173175
&DTU, GuardBlocks, "loop.exit", MaxBooleansInControlFlowHub.getValue());
176+
// if (!ChangedCFG)
177+
// return false;
174178

175179
restoreSSA(DT, L, ExitingBlocks, LoopExitBlock);
176180

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -298,7 +298,7 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
298298
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
299299
; GFX10-NEXT: s_mov_b32 s5, 0
300300
; GFX10-NEXT: ; implicit-def: $sgpr6
301-
; GFX10-NEXT: v_mov_b32_e32 v4, s5
301+
; GFX10-NEXT: v_mov_b32_e32 v5, s5
302302
; GFX10-NEXT: s_branch .LBB4_2
303303
; GFX10-NEXT: .LBB4_1: ; %Flow
304304
; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1
@@ -312,6 +312,7 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
312312
; GFX10-NEXT: s_cbranch_execz .LBB4_6
313313
; GFX10-NEXT: .LBB4_2: ; %cond.block.0
314314
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
315+
; GFX10-NEXT: v_mov_b32_e32 v4, v5
315316
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
316317
; GFX10-NEXT: s_and_saveexec_b32 s7, vcc_lo
317318
; GFX10-NEXT: s_cbranch_execz .LBB4_4
@@ -328,11 +329,12 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
328329
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7
329330
; GFX10-NEXT: v_cmp_ne_u32_e64 s4, v1, v4
330331
; GFX10-NEXT: s_mov_b32 s7, -1
332+
; GFX10-NEXT: ; implicit-def: $vgpr5
331333
; GFX10-NEXT: s_and_saveexec_b32 s8, s4
332334
; GFX10-NEXT: s_cbranch_execz .LBB4_1
333335
; GFX10-NEXT: ; %bb.5: ; %loop.cond
334336
; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1
335-
; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v4
337+
; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v4
336338
; GFX10-NEXT: s_andn2_b32 s4, -1, exec_lo
337339
; GFX10-NEXT: s_and_b32 s7, exec_lo, 0
338340
; GFX10-NEXT: s_or_b32 s7, s4, s7

llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -7646,9 +7646,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
76467646
; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9]
76477647
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
76487648
; GFX7-NEXT: s_cbranch_execnz .LBB28_2
7649-
; GFX7-NEXT: ; %bb.3: ; %Flow23
7649+
; GFX7-NEXT: ; %bb.3: ; %Flow22
76507650
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
7651-
; GFX7-NEXT: .LBB28_4: ; %Flow24
7651+
; GFX7-NEXT: .LBB28_4: ; %Flow23
76527652
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
76537653
; GFX7-NEXT: s_mov_b64 s[8:9], exec
76547654
; GFX7-NEXT: v_readfirstlane_b32 s10, v1
@@ -7676,7 +7676,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
76767676
; GFX7-NEXT: v_mov_b32_e32 v3, v4
76777677
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
76787678
; GFX7-NEXT: s_cbranch_execnz .LBB28_6
7679-
; GFX7-NEXT: .LBB28_7: ; %Flow22
7679+
; GFX7-NEXT: .LBB28_7: ; %Flow21
76807680
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
76817681
; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
76827682
; GFX7-NEXT: v_mul_f32_e32 v0, 0x42280000, v0
@@ -7725,7 +7725,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
77257725
; GFX7-NEXT: s_cbranch_execnz .LBB28_11
77267726
; GFX7-NEXT: ; %bb.12: ; %Flow
77277727
; GFX7-NEXT: s_or_b64 exec, exec, s[2:3]
7728-
; GFX7-NEXT: .LBB28_13: ; %Flow20
7728+
; GFX7-NEXT: .LBB28_13: ; %Flow19
77297729
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
77307730
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
77317731
; GFX7-NEXT: v_readfirstlane_b32 s4, v2
@@ -7770,9 +7770,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
77707770
; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9]
77717771
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
77727772
; GFX6-NEXT: s_cbranch_execnz .LBB28_2
7773-
; GFX6-NEXT: ; %bb.3: ; %Flow21
7773+
; GFX6-NEXT: ; %bb.3: ; %Flow20
77747774
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
7775-
; GFX6-NEXT: .LBB28_4: ; %Flow22
7775+
; GFX6-NEXT: .LBB28_4: ; %Flow21
77767776
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
77777777
; GFX6-NEXT: s_mov_b64 s[8:9], exec
77787778
; GFX6-NEXT: v_readfirstlane_b32 s10, v1
@@ -7800,7 +7800,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
78007800
; GFX6-NEXT: v_mov_b32_e32 v3, v4
78017801
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
78027802
; GFX6-NEXT: s_cbranch_execnz .LBB28_6
7803-
; GFX6-NEXT: .LBB28_7: ; %Flow20
7803+
; GFX6-NEXT: .LBB28_7: ; %Flow19
78047804
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
78057805
; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
78067806
; GFX6-NEXT: v_mul_f32_e32 v0, 0x42280000, v0
@@ -7849,7 +7849,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
78497849
; GFX6-NEXT: s_cbranch_execnz .LBB28_11
78507850
; GFX6-NEXT: ; %bb.12: ; %Flow
78517851
; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
7852-
; GFX6-NEXT: .LBB28_13: ; %Flow18
7852+
; GFX6-NEXT: .LBB28_13: ; %Flow17
78537853
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
78547854
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
78557855
; GFX6-NEXT: v_readfirstlane_b32 s4, v2
@@ -8483,9 +8483,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
84838483
; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9]
84848484
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
84858485
; GFX7-NEXT: s_cbranch_execnz .LBB29_2
8486-
; GFX7-NEXT: ; %bb.3: ; %Flow23
8486+
; GFX7-NEXT: ; %bb.3: ; %Flow22
84878487
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
8488-
; GFX7-NEXT: .LBB29_4: ; %Flow24
8488+
; GFX7-NEXT: .LBB29_4: ; %Flow23
84898489
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
84908490
; GFX7-NEXT: s_mov_b64 s[8:9], exec
84918491
; GFX7-NEXT: v_readfirstlane_b32 s10, v1
@@ -8513,7 +8513,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
85138513
; GFX7-NEXT: v_mov_b32_e32 v3, v4
85148514
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
85158515
; GFX7-NEXT: s_cbranch_execnz .LBB29_6
8516-
; GFX7-NEXT: .LBB29_7: ; %Flow22
8516+
; GFX7-NEXT: .LBB29_7: ; %Flow21
85178517
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
85188518
; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
85198519
; GFX7-NEXT: v_mul_f32_e32 v0, 0x42280000, v0
@@ -8562,7 +8562,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
85628562
; GFX7-NEXT: s_cbranch_execnz .LBB29_11
85638563
; GFX7-NEXT: ; %bb.12: ; %Flow
85648564
; GFX7-NEXT: s_or_b64 exec, exec, s[2:3]
8565-
; GFX7-NEXT: .LBB29_13: ; %Flow20
8565+
; GFX7-NEXT: .LBB29_13: ; %Flow19
85668566
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
85678567
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
85688568
; GFX7-NEXT: v_readfirstlane_b32 s4, v2
@@ -8607,9 +8607,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
86078607
; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9]
86088608
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
86098609
; GFX6-NEXT: s_cbranch_execnz .LBB29_2
8610-
; GFX6-NEXT: ; %bb.3: ; %Flow21
8610+
; GFX6-NEXT: ; %bb.3: ; %Flow20
86118611
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
8612-
; GFX6-NEXT: .LBB29_4: ; %Flow22
8612+
; GFX6-NEXT: .LBB29_4: ; %Flow21
86138613
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
86148614
; GFX6-NEXT: s_mov_b64 s[8:9], exec
86158615
; GFX6-NEXT: v_readfirstlane_b32 s10, v1
@@ -8637,7 +8637,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
86378637
; GFX6-NEXT: v_mov_b32_e32 v3, v4
86388638
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
86398639
; GFX6-NEXT: s_cbranch_execnz .LBB29_6
8640-
; GFX6-NEXT: .LBB29_7: ; %Flow20
8640+
; GFX6-NEXT: .LBB29_7: ; %Flow19
86418641
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
86428642
; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
86438643
; GFX6-NEXT: v_mul_f32_e32 v0, 0x42280000, v0
@@ -8686,7 +8686,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
86868686
; GFX6-NEXT: s_cbranch_execnz .LBB29_11
86878687
; GFX6-NEXT: ; %bb.12: ; %Flow
86888688
; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
8689-
; GFX6-NEXT: .LBB29_13: ; %Flow18
8689+
; GFX6-NEXT: .LBB29_13: ; %Flow17
86908690
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
86918691
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
86928692
; GFX6-NEXT: v_readfirstlane_b32 s4, v2

llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
3939
; GFX942-NEXT: s_cbranch_vccz .LBB0_1
4040
; GFX942-NEXT: ; %bb.3:
4141
; GFX942-NEXT: ; implicit-def: $sgpr3
42+
; GFX942-NEXT: ; implicit-def: $agpr0
4243
; GFX942-NEXT: .LBB0_4: ; %common.ret
4344
; GFX942-NEXT: s_endpgm
4445
;
@@ -79,6 +80,7 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
7980
; GFX908-NEXT: s_cbranch_vccz .LBB0_1
8081
; GFX908-NEXT: ; %bb.3:
8182
; GFX908-NEXT: ; implicit-def: $sgpr3
83+
; GFX908-NEXT: ; implicit-def: $agpr0
8284
; GFX908-NEXT: .LBB0_4: ; %common.ret
8385
; GFX908-NEXT: s_endpgm
8486
entry:

0 commit comments

Comments
 (0)