Skip to content

Commit 54711a6

Browse files
committed
[LoopSink] Allow sinking to PHI-use
This change allows sinking defs from the loop preheader with PHI-uses into the loop body. Loop sink can now see through a PHI-use and select the incoming blocks of the value being used as candidate sink destinations. This makes loop sink more effective, so more LICM can be undone if proven unprofitable with profile info. It addresses the motivating case in D87551 without resorting to profile-guided LICM, which breaks canonicalization. Differential Revision: https://reviews.llvm.org/D152772
1 parent a9e1d2e commit 54711a6

File tree

2 files changed

+133
-4
lines changed

2 files changed

+133
-4
lines changed

llvm/lib/Transforms/Scalar/LoopSink.cpp

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -177,13 +177,27 @@ static bool sinkInstruction(
177177
SmallPtrSet<BasicBlock *, 2> BBs;
178178
for (auto &U : I.uses()) {
179179
Instruction *UI = cast<Instruction>(U.getUser());
180-
// We cannot sink I to PHI-uses.
181-
if (isa<PHINode>(UI))
182-
return false;
180+
183181
// We cannot sink I if it has uses outside of the loop.
184182
if (!L.contains(LI.getLoopFor(UI->getParent())))
185183
return false;
186-
BBs.insert(UI->getParent());
184+
185+
if (!isa<PHINode>(UI)) {
186+
BBs.insert(UI->getParent());
187+
continue;
188+
}
189+
190+
// We cannot sink I to PHI-uses, try to look through PHI to find the incoming
191+
// block of the value being used.
192+
PHINode *PN = dyn_cast<PHINode>(UI);
193+
BasicBlock *PhiBB = PN->getIncomingBlock(U);
194+
195+
// If value's incoming block is from loop preheader directly, there's no
196+
// place to sink to, bailout.
197+
if (L.getLoopPreheader() == PhiBB)
198+
return false;
199+
200+
BBs.insert(PhiBB);
187201
}
188202

189203
// findBBsToSinkInto is O(BBs.size() * ColdLoopBBs.size()). We cap the max
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
2+
; RUN: opt -S -verify-memoryssa -passes=loop-sink < %s | FileCheck %s
3+
; Make sure that unprofitable LICM can be undone by loop sink, and loop sink can handle
4+
; sinking through PHI use.
5+
6+
7+
define dso_local i32 @_Z3fooii(i32 %arg, i32 %arg1, i32 %arg2) local_unnamed_addr #0 !prof !29 {
8+
; CHECK-LABEL: define dso_local i32 @_Z3fooii
9+
; CHECK-SAME: (i32 [[ARG:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !prof [[PROF29:![0-9]+]] {
10+
; CHECK-NEXT: .l.check.preheader:
11+
; CHECK-NEXT: br label [[DOTL_CHECK:%.*]]
12+
; CHECK: .l.ret.loopexit:
13+
; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[I10:%.*]], [[DOTL_ITERATE:%.*]] ]
14+
; CHECK-NEXT: ret i32 [[DOTLCSSA]]
15+
; CHECK: .l.check:
16+
; CHECK-NEXT: [[I4:%.*]] = phi i32 [ 0, [[DOTL_CHECK_PREHEADER:%.*]] ], [ [[I11:%.*]], [[DOTL_ITERATE]] ]
17+
; CHECK-NEXT: [[I5:%.*]] = phi i32 [ [[ARG]], [[DOTL_CHECK_PREHEADER]] ], [ [[I10]], [[DOTL_ITERATE]] ]
18+
; CHECK-NEXT: [[I6:%.*]] = icmp eq i32 [[I4]], [[ARG1]]
19+
; CHECK-NEXT: br i1 [[I6]], label [[DOTL_COLD:%.*]], label [[DOTL_ITERATE]], !prof [[PROF30:![0-9]+]]
20+
; CHECK: .l.cold:
21+
; CHECK-NEXT: [[FLAG:%.*]] = icmp eq i32 [[ARG1]], 5
22+
; CHECK-NEXT: br i1 [[FLAG]], label [[DOTL_COLD1:%.*]], label [[DOTL_COLD2:%.*]]
23+
; CHECK: .l.cold1:
24+
; CHECK-NEXT: [[TMP1:%.*]] = mul nsw i32 [[ARG2]], [[ARG2]]
25+
; CHECK-NEXT: br label [[DOTL_COLD3:%.*]]
26+
; CHECK: .l.cold2:
27+
; CHECK-NEXT: [[TMP2:%.*]] = add nsw i32 [[ARG2]], [[ARG2]]
28+
; CHECK-NEXT: br label [[DOTL_COLD3]]
29+
; CHECK: .l.cold3:
30+
; CHECK-NEXT: [[I7:%.*]] = phi i32 [ [[TMP1]], [[DOTL_COLD1]] ], [ [[TMP2]], [[DOTL_COLD2]] ]
31+
; CHECK-NEXT: [[I8:%.*]] = tail call i32 @_Z3bari(i32 [[I5]])
32+
; CHECK-NEXT: [[I9:%.*]] = add nsw i32 [[I8]], [[I7]]
33+
; CHECK-NEXT: br label [[DOTL_ITERATE]]
34+
; CHECK: .l.iterate:
35+
; CHECK-NEXT: [[I10]] = phi i32 [ [[I9]], [[DOTL_COLD3]] ], [ [[I5]], [[DOTL_CHECK]] ]
36+
; CHECK-NEXT: [[I11]] = add nuw nsw i32 [[I4]], 1
37+
; CHECK-NEXT: [[I12:%.*]] = icmp eq i32 [[I11]], [[I10]]
38+
; CHECK-NEXT: br i1 [[I12]], label [[DOTL_RET_LOOPEXIT:%.*]], label [[DOTL_CHECK]]
39+
;
40+
41+
.l.check.preheader:
42+
%flag = icmp eq i32 %arg1, 5
43+
%tmp2 = add nsw i32 %arg2, %arg2
44+
%tmp1 = mul nsw i32 %arg2, %arg2
45+
br label %.l.check
46+
47+
.l.ret.loopexit: ; preds = %.l.iterate
48+
%.lcssa = phi i32 [ %i10, %.l.iterate ]
49+
ret i32 %.lcssa
50+
51+
.l.check: ; preds = %.l.iterate, %.l.check.preheader
52+
%i4 = phi i32 [ 0, %.l.check.preheader ], [ %i11, %.l.iterate ]
53+
%i5 = phi i32 [ %arg, %.l.check.preheader ], [ %i10, %.l.iterate ]
54+
%i6 = icmp eq i32 %i4, %arg1
55+
br i1 %i6, label %.l.cold, label %.l.iterate, !prof !30
56+
57+
.l.cold: ; preds = %.l.check
58+
br i1 %flag, label %.l.cold1, label %.l.cold2
59+
60+
.l.cold1: ; preds = %.l.cold
61+
br label %.l.cold3
62+
63+
.l.cold2: ; preds = %.l.cold
64+
br label %.l.cold3
65+
66+
.l.cold3: ; preds = %.l.cold2, %.l.cold1
67+
%i7 = phi i32 [ %tmp1, %.l.cold1 ], [ %tmp2, %.l.cold2 ]
68+
%i8 = tail call i32 @_Z3bari(i32 %i5)
69+
%i9 = add nsw i32 %i8, %i7
70+
br label %.l.iterate
71+
72+
.l.iterate: ; preds = %.l.cold3, %.l.check
73+
%i10 = phi i32 [ %i9, %.l.cold3 ], [ %i5, %.l.check ]
74+
%i11 = add nuw nsw i32 %i4, 1
75+
%i12 = icmp eq i32 %i11, %i10
76+
br i1 %i12, label %.l.ret.loopexit, label %.l.check
77+
}
78+
79+
declare dso_local i32 @_Z3bari(i32) local_unnamed_addr
80+
81+
attributes #0 = { "use-sample-profile" }
82+
83+
!llvm.module.flags = !{!0}
84+
85+
!0 = !{i32 1, !"ProfileSummary", !1}
86+
!1 = !{!2, !3, !4, !5, !6, !7, !8, !9, !10, !11}
87+
!2 = !{!"ProfileFormat", !"SampleProfile"}
88+
!3 = !{!"TotalCount", i64 403}
89+
!4 = !{!"MaxCount", i64 200}
90+
!5 = !{!"MaxInternalCount", i64 0}
91+
!6 = !{!"MaxFunctionCount", i64 1}
92+
!7 = !{!"NumCounts", i64 6}
93+
!8 = !{!"NumFunctions", i64 1}
94+
!9 = !{!"IsPartialProfile", i64 0}
95+
!10 = !{!"PartialProfileRatio", double 0.000000e+00}
96+
!11 = !{!"DetailedSummary", !12}
97+
!12 = !{!13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28}
98+
!13 = !{i32 10000, i64 200, i32 2}
99+
!14 = !{i32 100000, i64 200, i32 2}
100+
!15 = !{i32 200000, i64 200, i32 2}
101+
!16 = !{i32 300000, i64 200, i32 2}
102+
!17 = !{i32 400000, i64 200, i32 2}
103+
!18 = !{i32 500000, i64 200, i32 2}
104+
!19 = !{i32 600000, i64 200, i32 2}
105+
!20 = !{i32 700000, i64 200, i32 2}
106+
!21 = !{i32 800000, i64 200, i32 2}
107+
!22 = !{i32 900000, i64 200, i32 2}
108+
!23 = !{i32 950000, i64 200, i32 2}
109+
!24 = !{i32 990000, i64 200, i32 2}
110+
!25 = !{i32 999000, i64 1, i32 5}
111+
!26 = !{i32 999900, i64 1, i32 5}
112+
!27 = !{i32 999990, i64 1, i32 5}
113+
!28 = !{i32 999999, i64 1, i32 5}
114+
!29 = !{!"function_entry_count", i64 2}
115+
!30 = !{!"branch_weights", i32 1, i32 201}

0 commit comments

Comments
 (0)