Skip to content

Commit 219d451

Browse files
committed
[Analysis][AArch64] Make fixed-width ordered reductions slightly more expensive
For tight loops like this:

    float r = 0;
    for (int i = 0; i < n; i++) {
      r += a[i];
    }

it's better not to vectorise at -O3 using fixed-width ordered reductions on AArch64 targets. Although the resulting number of instructions in the generated code ends up being comparable to not vectorising at all, there may be additional costs on some CPUs; for example, the scheduling may be worse. It therefore makes sense to deter vectorisation in tight loops.

Differential Revision: https://reviews.llvm.org/D108292
1 parent 13d8f00 commit 219d451

File tree

3 files changed

+15
-10
lines changed

3 files changed

+15
-10
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1999,8 +1999,13 @@ AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
19991999
Optional<FastMathFlags> FMF,
20002000
TTI::TargetCostKind CostKind) {
20012001
if (TTI::requiresOrderedReduction(FMF)) {
2002-
if (!isa<ScalableVectorType>(ValTy))
2003-
return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2002+
if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
2003+
InstructionCost BaseCost =
2004+
BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2005+
// Add on extra cost to reflect the extra overhead on some CPUs. We still
2006+
// end up vectorizing for more computationally intensive loops.
2007+
return BaseCost + FixedVTy->getNumElements();
2008+
}
20042009

20052010
if (Opcode != Instruction::FAdd)
20062011
return InstructionCost::getInvalid();

llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,10 @@
22

33
define void @strict_fp_reductions() {
44
; CHECK-LABEL: strict_fp_reductions
5-
; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
6-
; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
7-
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
8-
; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
5+
; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
6+
; CHECK-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
7+
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
8+
; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
99
%fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef)
1010
%fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef)
1111
%fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef)

llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@
66

77
target triple="aarch64-unknown-linux-gnu"
88

9-
; CHECK-VF4: Found an estimated cost of 17 for VF 4 For instruction: %add = fadd float %0, %sum.07
10-
; CHECK-VF8: Found an estimated cost of 34 for VF 8 For instruction: %add = fadd float %0, %sum.07
9+
; CHECK-VF4: Found an estimated cost of 21 for VF 4 For instruction: %add = fadd float %0, %sum.07
10+
; CHECK-VF8: Found an estimated cost of 42 for VF 8 For instruction: %add = fadd float %0, %sum.07
1111

1212
define float @fadd_strict32(float* noalias nocapture readonly %a, i64 %n) {
1313
entry:
@@ -28,8 +28,8 @@ for.end:
2828
}
2929

3030

31-
; CHECK-VF4: Found an estimated cost of 14 for VF 4 For instruction: %add = fadd double %0, %sum.07
32-
; CHECK-VF8: Found an estimated cost of 28 for VF 8 For instruction: %add = fadd double %0, %sum.07
31+
; CHECK-VF4: Found an estimated cost of 18 for VF 4 For instruction: %add = fadd double %0, %sum.07
32+
; CHECK-VF8: Found an estimated cost of 36 for VF 8 For instruction: %add = fadd double %0, %sum.07
3333

3434
define double @fadd_strict64(double* noalias nocapture readonly %a, i64 %n) {
3535
entry:

0 commit comments

Comments
 (0)