Skip to content

Commit fbb939a

Browse files
committed
Addressing suggestions
* Fixing comments * Adding more tests * Remove cmp latch presence requirements
1 parent 3a4555b commit fbb939a

File tree

2 files changed

+114
-41
lines changed

2 files changed

+114
-41
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 19 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -2648,29 +2648,21 @@ static Value *getExpandedStep(const InductionDescriptor &ID,
26482648
return I->second;
26492649
}
26502650

2651-
/// Knowing that loop \p L would be fully unrolled after vectorisation, add
2652-
/// instructions that will get simplified and thus should not have any cost to
2653-
/// \p InstsToIgnore
2654-
static void AddFullyUnrolledInstructionsToIgnore(
2651+
/// Knowing that loop \p L executes a single vector iteration, add instructions
2652+
/// that will get simplified and thus should not have any cost to \p
2653+
/// InstsToIgnore.
2654+
static void addFullyUnrolledInstructionsToIgnore(
26552655
Loop *L, const LoopVectorizationLegality::InductionList &IL,
26562656
SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
26572657
auto *Cmp = L->getLatchCmpInst();
2658-
if (!Cmp)
2659-
return;
2660-
InstsToIgnore.insert(Cmp);
2658+
if (Cmp)
2659+
InstsToIgnore.insert(Cmp);
26612660
for (const auto &[IV, IndDesc] : IL) {
2662-
// Get next iteration value of the induction variable
2661+
// Get next iteration value of the induction variable.
26632662
Instruction *IVInst =
26642663
cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch()));
2665-
bool IsSimplifiedAway = true;
2666-
// Check that this value used only to exit the loop
2667-
for (auto *UIV : IVInst->users()) {
2668-
if (UIV != IV && UIV != Cmp) {
2669-
IsSimplifiedAway = false;
2670-
break;
2671-
}
2672-
}
2673-
if (IsSimplifiedAway)
2664+
if (all_of(IVInst->users(),
2665+
[&](const User *U) { return U == IV || U == Cmp; }))
26742666
InstsToIgnore.insert(IVInst);
26752667
}
26762668
}
@@ -5561,12 +5553,13 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
55615553
InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
55625554
InstructionCost Cost;
55635555

5564-
// If with the given fixed width VF loop gets fully unrolled, ignore the costs
5565-
// of comparison and induction instructions, as they'll get simplified away
5556+
// If the vector loop gets executed exactly once with the given VF, ignore the
5557+
// costs of comparison and induction instructions, as they'll get simplified
5558+
// away.
55665559
SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
55675560
auto TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
55685561
if (VF.isFixed() && TC == VF.getFixedValue())
5569-
AddFullyUnrolledInstructionsToIgnore(TheLoop, Legal->getInductionVars(),
5562+
addFullyUnrolledInstructionsToIgnore(TheLoop, Legal->getInductionVars(),
55705563
ValuesToIgnoreForVF);
55715564

55725565
// For each block.
@@ -7259,11 +7252,14 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
72597252
IVInsts.push_back(CI);
72607253
}
72617254

7262-
// If with the given VF loop gets fully unrolled, ignore the costs of
7263-
// comparison and induction instructions, as they'll get simplified away
7255+
// If the vector loop gets executed exactly once with the given VF, ignore
7256+
// the costs of comparison and induction instructions, as they'll get
7257+
// simplified away.
7258+
// TODO: Remove this code after stepping away from the legacy cost model and
7259+
// adding code to simplify VPlans before calculating their costs.
72647260
auto TC = PSE.getSE()->getSmallConstantTripCount(OrigLoop);
72657261
if (VF.isFixed() && TC == VF.getFixedValue())
7266-
AddFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
7262+
addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
72677263
CostCtx.SkipCostComputation);
72687264

72697265
for (Instruction *IVInst : IVInsts) {
Lines changed: 95 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,38 +1,115 @@
11
; REQUIRES: asserts
2-
; RUN: opt < %s -mcpu=neoverse-v2 -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S 2>&1 | FileCheck %s
2+
; RUN: opt < %s -mcpu=neoverse-v2 -passes=loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s
33

44
target triple="aarch64--linux-gnu"
55

6+
; This test shows that comparison and next iteration IV have zero cost if the
7+
; vector loop gets executed exactly once with the given VF.
68
define i64 @test(ptr %a, ptr %b) #0 {
79
; CHECK: LV: Checking a loop in 'test'
8-
; CHECK: LV: Found an estimated cost of 1 for VF 8 For instruction: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
9-
; CHECK: LV: Found an estimated cost of 1 for VF 8 For instruction: %exitcond.not = icmp eq i64 %indvars.iv.next, 16
10-
; CHECK: LV: Vector loop of width 8 costs: 3.
11-
; CHECK-NOT: LV: Found an estimated cost of 1 for VF 16 For instruction: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
12-
; CHECK-NOT: LV: Found an estimated cost of 1 for VF 16 For instruction: %exitcond.not = icmp eq i64 %indvars.iv.next, 16
13-
; CHECK: LV: Vector loop of width 16 costs: 3.
10+
; CHECK: Cost of 1 for VF 8: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1
11+
; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
12+
; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
13+
; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
14+
; CHECK: Cost for VF 8: 26
15+
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
16+
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
17+
; CHECK: Cost for VF 16: 48
1418
; CHECK: LV: Selecting VF: 16
1519
entry:
1620
br label %for.body
1721

18-
for.cond.cleanup: ; preds = %for.body
19-
%add.lcssa = phi i64 [ %add, %for.body ]
20-
ret i64 %add.lcssa
22+
exit: ; preds = %for.body
23+
ret i64 %add
2124

2225
for.body: ; preds = %entry, %for.body
23-
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
24-
%sum.09 = phi i64 [ 0, %entry ], [ %add, %for.body ]
25-
%arrayidx = getelementptr inbounds i8, ptr %a, i64 %indvars.iv
26+
%i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
27+
%sum = phi i64 [ 0, %entry ], [ %add, %for.body ]
28+
%arrayidx = getelementptr inbounds i8, ptr %a, i64 %i.iv
2629
%0 = load i8, ptr %arrayidx, align 1
2730
%conv = zext i8 %0 to i64
28-
%arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv
31+
%arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %i.iv
2932
%1 = load i8, ptr %arrayidx2, align 1
3033
%conv3 = zext i8 %1 to i64
3134
%mul = mul nuw nsw i64 %conv3, %conv
32-
%add = add i64 %mul, %sum.09
33-
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
34-
%exitcond.not = icmp eq i64 %indvars.iv.next, 16
35-
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
35+
%add = add i64 %mul, %sum
36+
%i.iv.next = add nuw nsw i64 %i.iv, 1
37+
%exitcond.not = icmp eq i64 %i.iv.next, 16
38+
br i1 %exitcond.not, label %exit, label %for.body
39+
}
40+
41+
; Same as above, but the next-iteration IV has extra users, and thus its cost is not zero.
42+
define i64 @test_external_iv_user(ptr %a, ptr %b) #0 {
43+
; CHECK: LV: Checking a loop in 'test_external_iv_user'
44+
; CHECK: Cost of 1 for VF 8: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1
45+
; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
46+
; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
47+
; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
48+
; CHECK: Cost for VF 8: 26
49+
; CHECK-NEXT: Cost of 1 for VF 16: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1
50+
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
51+
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
52+
; CHECK: Cost for VF 16: 49
53+
; CHECK: LV: Selecting VF: vscale x 2
54+
entry:
55+
br label %for.body
56+
57+
for.body: ; preds = %entry, %for.body
58+
%i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
59+
%sum = phi i64 [ 0, %entry ], [ %add, %for.body ]
60+
%arrayidx = getelementptr inbounds nuw i8, ptr %a, i64 %i.iv
61+
%0 = load i8, ptr %arrayidx, align 1
62+
%conv = zext i8 %0 to i64
63+
%i.iv.next = add nuw nsw i64 %i.iv, 1
64+
%arrayidx2 = getelementptr inbounds nuw i8, ptr %b, i64 %i.iv.next
65+
%1 = load i8, ptr %arrayidx2, align 1
66+
%conv3 = zext i8 %1 to i64
67+
%mul = mul nuw nsw i64 %conv3, %conv
68+
%add = add i64 %sum, %mul
69+
%exitcond.not = icmp eq i64 %i.iv.next, 16
70+
br i1 %exitcond.not, label %exit, label %for.body
71+
72+
exit: ; preds = %for.body
73+
ret i64 %add
74+
}
75+
76+
; Same as the first test, but with two IVs, neither of which has extra users. Both next-iteration IVs have zero cost when VF equals the trip count.
77+
define i64 @test_two_ivs(ptr %a, ptr %b, i64 %start) #0 {
78+
; CHECK: LV: Checking a loop in 'test_two_ivs'
79+
; CHECK: Cost of 1 for VF 8: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1
80+
; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
81+
; CHECK-NEXT: Cost of 1 for VF 8: induction instruction %j.iv.next = add nuw nsw i64 %j.iv, 1
82+
; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
83+
; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
84+
; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
85+
; CHECK: Cost for VF 8: 27
86+
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
87+
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
88+
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
89+
; CHECK: Cost for VF 16: 48
90+
; CHECK: LV: Selecting VF: 16
91+
entry:
92+
br label %for.body
93+
94+
exit: ; preds = %for.body
95+
ret i64 %add
96+
97+
for.body: ; preds = %entry, %for.body
98+
%i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
99+
%j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
100+
%sum = phi i64 [ 0, %entry ], [ %add, %for.body ]
101+
%arrayidx = getelementptr inbounds i8, ptr %a, i64 %i.iv
102+
%0 = load i8, ptr %arrayidx, align 1
103+
%conv = zext i8 %0 to i64
104+
%arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %j.iv
105+
%1 = load i8, ptr %arrayidx2, align 1
106+
%conv3 = zext i8 %1 to i64
107+
%mul = mul nuw nsw i64 %conv3, %conv
108+
%add = add i64 %mul, %sum
109+
%i.iv.next = add nuw nsw i64 %i.iv, 1
110+
%j.iv.next = add nuw nsw i64 %j.iv, 1
111+
%exitcond.not = icmp eq i64 %i.iv.next, 16
112+
br i1 %exitcond.not, label %exit, label %for.body
36113
}
37114

38115
attributes #0 = { vscale_range(1, 16) "target-features"="+sve" }

0 commit comments

Comments
 (0)