Skip to content
This repository was archived by the owner on Mar 28, 2020. It is now read-only.

Commit 262bc11

Browse files
committed
[LV] Avoid rounding errors for predicated instruction costs
This patch modifies the cost calculation of predicated instructions (div and rem) to avoid the accumulation of rounding errors due to multiple truncating integer divisions. The calculation for predicated stores will be addressed in a follow-on patch since we currently don't scale the cost of predicated stores by block probability.

Differential Revision: https://reviews.llvm.org/D25333

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@284123 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent 97ca021 commit 262bc11

File tree

2 files changed

+82
-26
lines changed

2 files changed

+82
-26
lines changed

lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 29 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -3542,10 +3542,7 @@ static Value *addFastMathFlag(Value *V) {
35423542
/// \brief Estimate the overhead of scalarizing a value based on its type.
35433543
/// Insert and Extract are set if the result needs to be inserted and/or
35443544
/// extracted from vectors.
3545-
/// If the instruction is also to be predicated, add the cost of a PHI
3546-
/// node to the insertion cost.
35473545
static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract,
3548-
bool Predicated,
35493546
const TargetTransformInfo &TTI) {
35503547
if (Ty->isVoidTy())
35513548
return 0;
@@ -3556,41 +3553,30 @@ static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract,
35563553
for (unsigned I = 0, E = Ty->getVectorNumElements(); I < E; ++I) {
35573554
if (Extract)
35583555
Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, Ty, I);
3559-
if (Insert) {
3556+
if (Insert)
35603557
Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, I);
3561-
if (Predicated)
3562-
Cost += TTI.getCFInstrCost(Instruction::PHI);
3563-
}
35643558
}
35653559

3566-
// If we have a predicated instruction, it may not be executed for each
3567-
// vector lane. Scale the cost by the probability of executing the
3568-
// predicated block.
3569-
if (Predicated)
3570-
Cost /= getReciprocalPredBlockProb();
3571-
35723560
return Cost;
35733561
}
35743562

35753563
/// \brief Estimate the overhead of scalarizing an Instruction based on the
35763564
/// types of its operands and return value.
35773565
static unsigned getScalarizationOverhead(SmallVectorImpl<Type *> &OpTys,
3578-
Type *RetTy, bool Predicated,
3566+
Type *RetTy,
35793567
const TargetTransformInfo &TTI) {
35803568
unsigned ScalarizationCost =
3581-
getScalarizationOverhead(RetTy, true, false, Predicated, TTI);
3569+
getScalarizationOverhead(RetTy, true, false, TTI);
35823570

35833571
for (Type *Ty : OpTys)
3584-
ScalarizationCost +=
3585-
getScalarizationOverhead(Ty, false, true, Predicated, TTI);
3572+
ScalarizationCost += getScalarizationOverhead(Ty, false, true, TTI);
35863573

35873574
return ScalarizationCost;
35883575
}
35893576

35903577
/// \brief Estimate the overhead of scalarizing an instruction. This is a
35913578
/// convenience wrapper for the type-based getScalarizationOverhead API.
35923579
static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
3593-
bool Predicated,
35943580
const TargetTransformInfo &TTI) {
35953581
if (VF == 1)
35963582
return 0;
@@ -3602,7 +3588,7 @@ static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
36023588
for (unsigned OpInd = 0; OpInd < OperandsNum; ++OpInd)
36033589
OpTys.push_back(ToVectorTy(I->getOperand(OpInd)->getType(), VF));
36043590

3605-
return getScalarizationOverhead(OpTys, RetTy, Predicated, TTI);
3591+
return getScalarizationOverhead(OpTys, RetTy, TTI);
36063592
}
36073593

36083594
// Estimate cost of a call instruction CI if it were vectorized with factor VF.
@@ -3635,7 +3621,7 @@ static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
36353621

36363622
// Compute costs of unpacking argument values for the scalar calls and
36373623
// packing the return values to a vector.
3638-
unsigned ScalarizationCost = getScalarizationOverhead(Tys, RetTy, false, TTI);
3624+
unsigned ScalarizationCost = getScalarizationOverhead(Tys, RetTy, TTI);
36393625

36403626
unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
36413627

@@ -6536,10 +6522,27 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
65366522
// vector lane. Get the scalarization cost and scale this amount by the
65376523
// probability of executing the predicated block. If the instruction is not
65386524
// predicated, we fall through to the next case.
6539-
if (VF > 1 && Legal->isScalarWithPredication(I))
6540-
return VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy) /
6541-
getReciprocalPredBlockProb() +
6542-
getScalarizationOverhead(I, VF, true, TTI);
6525+
if (VF > 1 && Legal->isScalarWithPredication(I)) {
6526+
unsigned Cost = 0;
6527+
6528+
// These instructions have a non-void type, so account for the phi nodes
6529+
// that we will create. This cost is likely to be zero. The phi node
6530+
// cost, if any, should be scaled by the block probability because it
6531+
// models a copy at the end of each predicated block.
6532+
Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
6533+
6534+
// The cost of the non-predicated instruction.
6535+
Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
6536+
6537+
// The cost of insertelement and extractelement instructions needed for
6538+
// scalarization.
6539+
Cost += getScalarizationOverhead(I, VF, TTI);
6540+
6541+
// Scale the cost by the probability of executing the predicated blocks.
6542+
// This assumes the predicated block for each vector lane is equally
6543+
// likely.
6544+
return Cost / getReciprocalPredBlockProb();
6545+
}
65436546
case Instruction::Add:
65446547
case Instruction::FAdd:
65456548
case Instruction::Sub:
@@ -6695,7 +6698,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
66956698

66966699
// Get the overhead of the extractelement and insertelement instructions
66976700
// we might create due to scalarization.
6698-
Cost += getScalarizationOverhead(I, VF, false, TTI);
6701+
Cost += getScalarizationOverhead(I, VF, TTI);
66996702

67006703
return Cost;
67016704
}
@@ -6782,7 +6785,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
67826785
// The cost of executing VF copies of the scalar instruction. This opcode
67836786
// is unknown. Assume that it is the same as 'mul'.
67846787
return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
6785-
getScalarizationOverhead(I, VF, false, TTI);
6788+
getScalarizationOverhead(I, VF, TTI);
67866789
} // end of switch.
67876790
}
67886791

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
; REQUIRES: asserts
2+
; RUN: opt < %s -force-vector-width=2 -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s
3+
4+
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
5+
target triple = "aarch64--linux-gnu"
6+
7+
; Check predication-related cost calculations, including scalarization overhead
8+
; and block probability scaling. Note that the functionality being tested is
9+
; not specific to AArch64. We specify a target to get actual values for the
10+
; instruction costs.
11+
12+
; CHECK-LABEL: predicated_udiv
13+
;
14+
; This test checks that we correctly compute the cost of the predicated udiv
15+
; instruction. If we assume the block probability is 50%, we compute the cost
16+
; as:
17+
;
18+
; Cost for vector lane zero:
19+
; (udiv(1) + 2 * extractelement(0) + insertelement(0)) / 2 = 0
20+
; Cost for vector lane one:
21+
; (udiv(1) + 2 * extractelement(3) + insertelement(3)) / 2 = 5
22+
;
23+
; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
24+
; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
25+
;
26+
define i32 @predicated_udiv(i32* %a, i32* %b, i1 %c, i64 %n) {
27+
entry:
28+
br label %for.body
29+
30+
for.body:
31+
%i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
32+
%r = phi i32 [ 0, %entry ], [ %tmp6, %for.inc ]
33+
%tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
34+
%tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
35+
%tmp2 = load i32, i32* %tmp0, align 4
36+
%tmp3 = load i32, i32* %tmp1, align 4
37+
br i1 %c, label %if.then, label %for.inc
38+
39+
if.then:
40+
%tmp4 = udiv i32 %tmp2, %tmp3
41+
br label %for.inc
42+
43+
for.inc:
44+
%tmp5 = phi i32 [ %tmp3, %for.body ], [ %tmp4, %if.then]
45+
%tmp6 = add i32 %r, %tmp5
46+
%i.next = add nuw nsw i64 %i, 1
47+
%cond = icmp slt i64 %i.next, %n
48+
br i1 %cond, label %for.body, label %for.end
49+
50+
for.end:
51+
%tmp7 = phi i32 [ %tmp6, %for.inc ]
52+
ret i32 %tmp7
53+
}

0 commit comments

Comments
 (0)