Skip to content

Commit 405eae5

Browse files
committed
[LoopPeel] Peel if it turns invariant loads dereferenceable.
This patch adds a new cost heuristic that allows peeling a single iteration off read-only loops if the loop contains a load that (1) feeds an exit condition, (2) dominates the latch, (3) is not already known to be dereferenceable, and (4) has a loop-invariant address. If all non-latch exits are terminated with unreachable, such loads in the loop are guaranteed to be dereferenceable after peeling, enabling hoisting/CSE'ing them. This enables vectorization of loops with certain runtime checks, like multiple calls to `std::vector::at` if the vector is passed as a pointer. Reviewed By: mkazantsev Differential Revision: https://reviews.llvm.org/D108114 (cherry-picked from cd0ba9dc58c5806f4e3cc9635ab1f64af6973a83)
1 parent 9d24a8e commit 405eae5

File tree

5 files changed

+333
-56
lines changed

5 files changed

+333
-56
lines changed

llvm/include/llvm/Transforms/Utils/LoopPeel.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,8 @@ gatherPeelingPreferences(Loop *L, ScalarEvolution &SE,
3232

3333
void computePeelCount(Loop *L, unsigned LoopSize,
3434
TargetTransformInfo::PeelingPreferences &PP,
35-
unsigned &TripCount, ScalarEvolution &SE,
36-
unsigned Threshold = UINT_MAX);
35+
unsigned &TripCount, DominatorTree &DT,
36+
ScalarEvolution &SE, unsigned Threshold = UINT_MAX);
3737

3838
} // end namespace llvm
3939

llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -877,7 +877,7 @@ bool llvm::computeUnrollCount(
877877
}
878878

879879
// 4th priority is loop peeling.
880-
computePeelCount(L, LoopSize, PP, TripCount, SE, UP.Threshold);
880+
computePeelCount(L, LoopSize, PP, TripCount, DT, SE, UP.Threshold);
881881
if (PP.PeelCount) {
882882
UP.Runtime = false;
883883
UP.Count = 1;

llvm/lib/Transforms/Utils/LoopPeel.cpp

Lines changed: 66 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "llvm/ADT/Optional.h"
1515
#include "llvm/ADT/SmallVector.h"
1616
#include "llvm/ADT/Statistic.h"
17+
#include "llvm/Analysis/Loads.h"
1718
#include "llvm/Analysis/LoopInfo.h"
1819
#include "llvm/Analysis/LoopIterator.h"
1920
#include "llvm/Analysis/ScalarEvolution.h"
@@ -165,6 +166,66 @@ static unsigned calculateIterationsToInvariance(
165166
return ToInvariance;
166167
}
167168

169+
// Try to find any invariant memory reads that will become dereferenceable in
170+
// the remainder loop after peeling. The load must also be used (transitively)
171+
// by an exit condition. Returns the number of iterations to peel off (at the
172+
// moment either 0 or 1).
//
// \p L is the loop to inspect. \p DT is used for the "dominates the latch"
// test and is also passed as context to the dereferenceability query.
// Returns 0 as soon as any instruction in the loop may write to memory.
// NOTE(review): the function name misspells "Dereferenceable"; a rename must
// also update the call in computePeelCount.
173+
static unsigned peelToTurnInvariantLoadsDerefencebale(Loop &L,
174+
DominatorTree &DT) {
175+
// Skip loops with a single exiting block, because there should be no benefit
176+
// for the heuristic below.
177+
if (L.getExitingBlock())
178+
return 0;
179+
180+
// All non-latch exit blocks must have an UnreachableInst terminator.
181+
// Otherwise the heuristic below may not be profitable.
182+
SmallVector<BasicBlock *, 4> Exits;
183+
L.getUniqueNonLatchExitBlocks(Exits);
184+
if (any_of(Exits, [](const BasicBlock *BB) {
185+
return !isa<UnreachableInst>(BB->getTerminator());
186+
}))
187+
return 0;
188+
189+
// Now look for invariant loads that dominate the latch and are not known to
190+
// be dereferenceable. If there are such loads and no writes, they will become
191+
// dereferenceable in the loop if the first iteration is peeled off. Also
192+
// collect the set of instructions controlled by such loads. Only peel if an
193+
// exit condition uses (transitively) such a load.
194+
BasicBlock *Header = L.getHeader();
195+
// NOTE(review): the latch is used below without a null check; presumably
// callers only pass loops that have a unique latch -- confirm.
BasicBlock *Latch = L.getLoopLatch();
196+
// Values that depend, directly or transitively, on a qualifying load.
SmallPtrSet<Value *, 8> LoadUsers;
197+
const DataLayout &DL = L.getHeader()->getModule()->getDataLayout();
198+
for (BasicBlock *BB : L.blocks()) {
199+
for (Instruction &I : *BB) {
200+
// A possible write anywhere in the loop disables the heuristic. This check
// runs before the header skip below, so header instructions are covered too.
if (I.mayWriteToMemory())
201+
return 0;
202+
203+
// If I already depends on a qualifying load, so do all of its users.
// Propagation only happens when I itself is visited, so it relies on an
// instruction being inserted into LoadUsers before it is reached here.
// NOTE(review): presumably guaranteed by the L.blocks() visitation order --
// confirm.
auto Iter = LoadUsers.find(&I);
204+
if (Iter != LoadUsers.end()) {
205+
for (Value *U : I.users())
206+
LoadUsers.insert(U);
207+
}
208+
// Do not look for reads in the header; they can already be hoisted
209+
// without peeling.
210+
if (BB == Header)
211+
continue;
212+
if (auto *LI = dyn_cast<LoadInst>(&I)) {
213+
Value *Ptr = LI->getPointerOperand();
214+
// A load qualifies when its block dominates the latch, its address is
// loop-invariant, and the address is not already known dereferenceable.
// Seed LoadUsers with the direct users of such a load.
if (DT.dominates(BB, Latch) && L.isLoopInvariant(Ptr) &&
215+
!isDereferenceablePointer(Ptr, LI->getType(), DL, LI, &DT))
216+
for (Value *U : I.users())
217+
LoadUsers.insert(U);
218+
}
219+
}
220+
}
221+
// Peel one iteration if the terminator of any exiting block depends on a
// qualifying load; otherwise do not peel.
SmallVector<BasicBlock *> ExitingBlocks;
222+
L.getExitingBlocks(ExitingBlocks);
223+
for (BasicBlock *Exiting : ExitingBlocks)
224+
if (LoadUsers.find(Exiting->getTerminator()) != LoadUsers.end())
225+
return 1;
226+
return 0;
227+
}
228+
168229
// Return the number of iterations to peel off that make conditions in the
169230
// body true/false. For example, if we peel 2 iterations off the loop below,
170231
// the condition i < 2 can be evaluated at compile time.
@@ -280,8 +341,8 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
280341
// Return the number of iterations we want to peel off.
281342
void llvm::computePeelCount(Loop *L, unsigned LoopSize,
282343
TargetTransformInfo::PeelingPreferences &PP,
283-
unsigned &TripCount, ScalarEvolution &SE,
284-
unsigned Threshold) {
344+
unsigned &TripCount, DominatorTree &DT,
345+
ScalarEvolution &SE, unsigned Threshold) {
285346
assert(LoopSize > 0 && "Zero loop size is not allowed!");
286347
// Save the PP.PeelCount value set by the target in
287348
// TTI.getPeelingPreferences or by the flag -unroll-peel-count.
@@ -348,6 +409,9 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
348409
DesiredPeelCount = std::max(DesiredPeelCount,
349410
countToEliminateCompares(*L, MaxPeelCount, SE));
350411

412+
if (DesiredPeelCount == 0)
413+
DesiredPeelCount = peelToTurnInvariantLoadsDerefencebale(*L, DT);
414+
351415
if (DesiredPeelCount > 0) {
352416
DesiredPeelCount = std::min(DesiredPeelCount, MaxPeelCount);
353417
// Consider max peel count limitation.

llvm/test/Transforms/LoopUnroll/peel-to-turn-invariant-accesses-dereferenceable.ll

Lines changed: 112 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,99 @@ declare void @foo()
66
define i32 @peel_readonly_to_make_loads_derefenceable(i32* %ptr, i32 %N, i32* %inv, i1 %c.1) {
77
; CHECK-LABEL: @peel_readonly_to_make_loads_derefenceable(
88
; CHECK-NEXT: entry:
9+
; CHECK-NEXT: br label [[LOOP_HEADER_PEEL_BEGIN:%.*]]
10+
; CHECK: loop.header.peel.begin:
11+
; CHECK-NEXT: br label [[LOOP_HEADER_PEEL:%.*]]
12+
; CHECK: loop.header.peel:
13+
; CHECK-NEXT: br i1 [[C_1:%.*]], label [[THEN_PEEL:%.*]], label [[UNREACHABLE_EXIT:%.*]]
14+
; CHECK: then.peel:
15+
; CHECK-NEXT: [[I_PEEL:%.*]] = load i32, i32* [[INV:%.*]], align 4
16+
; CHECK-NEXT: [[C_2_PEEL:%.*]] = icmp ult i32 [[I_PEEL]], 2
17+
; CHECK-NEXT: br i1 [[C_2_PEEL]], label [[LOOP_LATCH_PEEL:%.*]], label [[UNREACHABLE_EXIT]]
18+
; CHECK: loop.latch.peel:
19+
; CHECK-NEXT: [[GEP_PEEL:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i32 1
20+
; CHECK-NEXT: [[LV_PEEL:%.*]] = load i32, i32* [[GEP_PEEL]], align 4
21+
; CHECK-NEXT: [[SUM_NEXT_PEEL:%.*]] = add i32 0, [[LV_PEEL]]
22+
; CHECK-NEXT: [[IV_NEXT_PEEL:%.*]] = add nuw nsw i32 1, 1
23+
; CHECK-NEXT: [[C_3_PEEL:%.*]] = icmp ult i32 1, 1000
24+
; CHECK-NEXT: br i1 [[C_3_PEEL]], label [[LOOP_HEADER_PEEL_NEXT:%.*]], label [[EXIT:%.*]]
25+
; CHECK: loop.header.peel.next:
26+
; CHECK-NEXT: br label [[LOOP_HEADER_PEEL_NEXT1:%.*]]
27+
; CHECK: loop.header.peel.next1:
28+
; CHECK-NEXT: br label [[ENTRY_PEEL_NEWPH:%.*]]
29+
; CHECK: entry.peel.newph:
30+
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
31+
; CHECK: loop.header:
32+
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT_PEEL]], [[ENTRY_PEEL_NEWPH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
33+
; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ [[SUM_NEXT_PEEL]], [[ENTRY_PEEL_NEWPH]] ], [ [[SUM_NEXT:%.*]], [[LOOP_LATCH]] ]
34+
; CHECK-NEXT: br i1 [[C_1]], label [[THEN:%.*]], label [[UNREACHABLE_EXIT_LOOPEXIT:%.*]]
35+
; CHECK: then:
36+
; CHECK-NEXT: [[I:%.*]] = load i32, i32* [[INV]], align 4
37+
; CHECK-NEXT: [[C_2:%.*]] = icmp ult i32 [[I]], 2
38+
; CHECK-NEXT: br i1 [[C_2]], label [[LOOP_LATCH]], label [[UNREACHABLE_EXIT_LOOPEXIT]]
39+
; CHECK: loop.latch:
40+
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, i32* [[PTR]], i32 [[IV]]
41+
; CHECK-NEXT: [[LV:%.*]] = load i32, i32* [[GEP]], align 4
42+
; CHECK-NEXT: [[SUM_NEXT]] = add i32 [[SUM]], [[LV]]
43+
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
44+
; CHECK-NEXT: [[C_3:%.*]] = icmp ult i32 [[IV]], 1000
45+
; CHECK-NEXT: br i1 [[C_3]], label [[LOOP_HEADER]], label [[EXIT_LOOPEXIT:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
46+
; CHECK: exit.loopexit:
47+
; CHECK-NEXT: [[SUM_NEXT_LCSSA_PH:%.*]] = phi i32 [ [[SUM_NEXT]], [[LOOP_LATCH]] ]
48+
; CHECK-NEXT: br label [[EXIT]]
49+
; CHECK: exit:
50+
; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i32 [ [[SUM_NEXT_PEEL]], [[LOOP_LATCH_PEEL]] ], [ [[SUM_NEXT_LCSSA_PH]], [[EXIT_LOOPEXIT]] ]
51+
; CHECK-NEXT: ret i32 [[SUM_NEXT_LCSSA]]
52+
; CHECK: unreachable.exit.loopexit:
53+
; CHECK-NEXT: br label [[UNREACHABLE_EXIT]]
54+
; CHECK: unreachable.exit:
55+
; CHECK-NEXT: call void @foo()
56+
; CHECK-NEXT: unreachable
57+
;
58+
entry:
59+
br label %loop.header
60+
61+
loop.header:
62+
%iv = phi i32 [ 1, %entry ], [ %iv.next, %loop.latch ]
63+
%sum = phi i32 [ 0, %entry ], [ %sum.next, %loop.latch ]
64+
br i1 %c.1, label %then, label %unreachable.exit
65+
66+
then:
67+
%i = load i32, i32* %inv
68+
%c.2 = icmp ult i32 %i, 2
69+
br i1 %c.2, label %loop.latch, label %unreachable.exit
70+
71+
loop.latch:
72+
%gep = getelementptr i32, i32* %ptr, i32 %iv
73+
%lv = load i32, i32* %gep
74+
%sum.next = add i32 %sum, %lv
75+
%iv.next = add nuw nsw i32 %iv, 1
76+
%c.3 = icmp ult i32 %iv, 1000
77+
br i1 %c.3, label %loop.header, label %exit
78+
79+
exit:
80+
ret i32 %sum.next
81+
82+
unreachable.exit:
83+
call void @foo()
84+
unreachable
85+
}
86+
87+
define i32 @peel_readonly_to_make_loads_derefenceable_exits_lead_to_unreachable(i32* %ptr, i32 %N, i32* %inv, i1 %c.1) {
88+
; CHECK-LABEL: @peel_readonly_to_make_loads_derefenceable_exits_lead_to_unreachable(
89+
; CHECK-NEXT: entry:
990
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
1091
; CHECK: loop.header:
1192
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
1293
; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[SUM_NEXT:%.*]], [[LOOP_LATCH]] ]
13-
; CHECK-NEXT: br i1 [[C_1:%.*]], label [[THEN:%.*]], label [[UNREACHABLE_EXIT:%.*]]
94+
; CHECK-NEXT: br i1 [[C_1:%.*]], label [[THEN:%.*]], label [[EXIT_2:%.*]]
1495
; CHECK: then:
1596
; CHECK-NEXT: [[I:%.*]] = load i32, i32* [[INV:%.*]], align 4
1697
; CHECK-NEXT: [[C_2:%.*]] = icmp ult i32 [[I]], 2
17-
; CHECK-NEXT: br i1 [[C_2]], label [[LOOP_LATCH]], label [[UNREACHABLE_EXIT]]
98+
; CHECK-NEXT: br i1 [[C_2]], label [[THEN_2:%.*]], label [[EXIT_2]]
99+
; CHECK: then.2:
100+
; CHECK-NEXT: [[C_4:%.*]] = icmp ult i32 [[I]], 4
101+
; CHECK-NEXT: br i1 [[C_4]], label [[LOOP_LATCH]], label [[EXIT_3:%.*]]
18102
; CHECK: loop.latch:
19103
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i32 [[IV]]
20104
; CHECK-NEXT: [[LV:%.*]] = load i32, i32* [[GEP]], align 4
@@ -25,7 +109,11 @@ define i32 @peel_readonly_to_make_loads_derefenceable(i32* %ptr, i32 %N, i32* %i
25109
; CHECK: exit:
26110
; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i32 [ [[SUM_NEXT]], [[LOOP_LATCH]] ]
27111
; CHECK-NEXT: ret i32 [[SUM_NEXT_LCSSA]]
28-
; CHECK: unreachable.exit:
112+
; CHECK: exit.2:
113+
; CHECK-NEXT: br label [[UNREACHABLE_BB:%.*]]
114+
; CHECK: exit.3:
115+
; CHECK-NEXT: br label [[UNREACHABLE_BB]]
116+
; CHECK: unreachable.bb:
29117
; CHECK-NEXT: call void @foo()
30118
; CHECK-NEXT: unreachable
31119
;
@@ -35,12 +123,16 @@ entry:
35123
loop.header:
36124
%iv = phi i32 [ 1, %entry ], [ %iv.next, %loop.latch ]
37125
%sum = phi i32 [ 0, %entry ], [ %sum.next, %loop.latch ]
38-
br i1 %c.1, label %then, label %unreachable.exit
126+
br i1 %c.1, label %then, label %exit.2
39127

40128
then:
41129
%i = load i32, i32* %inv
42130
%c.2 = icmp ult i32 %i, 2
43-
br i1 %c.2, label %loop.latch, label %unreachable.exit
131+
br i1 %c.2, label %then.2, label %exit.2
132+
133+
then.2:
134+
%c.4 = icmp ult i32 %i, 4
135+
br i1 %c.4, label %loop.latch, label %exit.3
44136

45137
loop.latch:
46138
%gep = getelementptr i32, i32* %ptr, i32 %iv
@@ -53,7 +145,13 @@ loop.latch:
53145
exit:
54146
ret i32 %sum.next
55147

56-
unreachable.exit:
148+
exit.2:
149+
br label %unreachable.bb
150+
151+
exit.3:
152+
br label %unreachable.bb
153+
154+
unreachable.bb:
57155
call void @foo()
58156
unreachable
59157
}
@@ -302,18 +400,18 @@ unreachable.exit:
302400

303401
declare i32 @llvm.experimental.deoptimize.i32(...)
304402

305-
define i32 @do_not_peel_with_deopt_exit(i32* %ptr, i32 %N, i32* %inv, i1 %c.1) {
306-
; CHECK-LABEL: @do_not_peel_with_deopt_exit(
403+
define i32 @peel_with_deopt_exit(i32* %ptr, i32 %N, i32* %inv, i1 %c.1) {
404+
; CHECK-LABEL: @peel_with_deopt_exit(
307405
; CHECK-NEXT: entry:
308406
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
309407
; CHECK: loop.header:
310408
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
311409
; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[SUM_NEXT:%.*]], [[LOOP_LATCH]] ]
312-
; CHECK-NEXT: br i1 [[C_1:%.*]], label [[THEN:%.*]], label [[UNREACHABLE_EXIT:%.*]]
410+
; CHECK-NEXT: br i1 [[C_1:%.*]], label [[THEN:%.*]], label [[DEOPT_EXIT:%.*]]
313411
; CHECK: then:
314412
; CHECK-NEXT: [[I:%.*]] = load i32, i32* [[INV:%.*]], align 4
315413
; CHECK-NEXT: [[C_2:%.*]] = icmp ult i32 [[I]], 2
316-
; CHECK-NEXT: br i1 [[C_2]], label [[LOOP_LATCH]], label [[UNREACHABLE_EXIT]]
414+
; CHECK-NEXT: br i1 [[C_2]], label [[LOOP_LATCH]], label [[DEOPT_EXIT]]
317415
; CHECK: loop.latch:
318416
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i32 [[IV]]
319417
; CHECK-NEXT: [[LV:%.*]] = load i32, i32* [[GEP]], align 4
@@ -324,7 +422,7 @@ define i32 @do_not_peel_with_deopt_exit(i32* %ptr, i32 %N, i32* %inv, i1 %c.1) {
324422
; CHECK: exit:
325423
; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i32 [ [[SUM_NEXT]], [[LOOP_LATCH]] ]
326424
; CHECK-NEXT: ret i32 [[SUM_NEXT_LCSSA]]
327-
; CHECK: unreachable.exit:
425+
; CHECK: deopt.exit:
328426
; CHECK-NEXT: [[SUM_LCSSA:%.*]] = phi i32 [ [[SUM]], [[THEN]] ], [ [[SUM]], [[LOOP_HEADER]] ]
329427
; CHECK-NEXT: [[RVAL:%.*]] = call i32 (...) @llvm.experimental.deoptimize.i32() [ "deopt"(i32 [[SUM_LCSSA]]) ]
330428
; CHECK-NEXT: ret i32 [[RVAL]]
@@ -335,12 +433,12 @@ entry:
335433
loop.header:
336434
%iv = phi i32 [ 1, %entry ], [ %iv.next, %loop.latch ]
337435
%sum = phi i32 [ 0, %entry ], [ %sum.next, %loop.latch ]
338-
br i1 %c.1, label %then, label %unreachable.exit
436+
br i1 %c.1, label %then, label %deopt.exit
339437

340438
then:
341439
%i = load i32, i32* %inv
342440
%c.2 = icmp ult i32 %i, 2
343-
br i1 %c.2, label %loop.latch, label %unreachable.exit
441+
br i1 %c.2, label %loop.latch, label %deopt.exit
344442

345443
loop.latch:
346444
%gep = getelementptr i32, i32* %ptr, i32 %iv
@@ -353,7 +451,7 @@ loop.latch:
353451
exit:
354452
ret i32 %sum.next
355453

356-
unreachable.exit:
454+
deopt.exit:
357455
%rval = call i32(...) @llvm.experimental.deoptimize.i32() [ "deopt"(i32 %sum) ]
358456
ret i32 %rval
359457
}

0 commit comments

Comments
 (0)