
Commit df19d87

[LV] Add option to tune the cost model, NFC
For Neon, the default non-constant stride cost is conservative, and it is a local variable, which makes it inconvenient to tune for the loop vectorizer. This patch exposes it as a command-line option, similar to SVEGatherOverhead introduced in D115143.

Fixes llvm#63082.

Reviewed By: dmgreen, fhahn

Differential Revision: https://reviews.llvm.org/D152253
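A rough usage sketch of the new knob (assumptions: input.ll is a placeholder module; the value 5 simply mirrors the regression test added below, and the other options are copied from that test's RUN line):

    opt -passes=loop-vectorize -mtriple=arm64-apple-ios -mcpu=cyclone \
        -neon-nonconst-stride-overhead=5 -S input.ll

Lowering the value makes non-constant-stride (gather-like) address computation look cheaper to the cost model, so the vectorizer is more willing to vectorize such loops; the default of 10 preserves the previous behaviour.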

2 files changed (+89, -1 lines)

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 5 additions & 1 deletion
@@ -42,6 +42,10 @@ static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
 static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
                                                   cl::init(15), cl::Hidden);
 
+static cl::opt<unsigned>
+    NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
+                               cl::Hidden);
+
 namespace {
 class TailFoldingOption {
   // These bitfields will only ever be set to something non-zero in operator=,
@@ -2577,7 +2581,7 @@ InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty,
   // likely result in more instructions compared to scalar code where the
   // computation can more often be merged into the index mode. The resulting
   // extra micro-ops can significantly decrease throughput.
-  unsigned NumVectorInstToHideOverhead = 10;
+  unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
   int MaxMergeDistance = 64;
 
   if (Ty->isVectorTy() && SE &&
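For readers unfamiliar with the pattern, this is the standard LLVM cl::opt idiom: declare a hidden numeric option whose default equals the old hard-coded constant, then read it where the literal used to be, so behaviour only changes when the flag is passed explicitly (hence the NFC tag). A minimal standalone sketch of that idiom follows; it is not the actual AArch64TTIImpl code, and the desc string and helper function are illustrative additions.

    #include "llvm/Support/CommandLine.h"

    // Hidden tuning knob; cl::init(10) matches the old hard-coded constant,
    // so the default behaviour is unchanged unless the flag is given.
    static llvm::cl::opt<unsigned> NeonNonConstStrideOverhead(
        "neon-nonconst-stride-overhead", llvm::cl::init(10), llvm::cl::Hidden,
        llvm::cl::desc("Illustrative only: cost charged for non-constant "
                       "strided Neon accesses"));

    // Hypothetical stand-in for the read inside getAddressComputationCost(),
    // where the literal 10 used to live.
    static unsigned numVectorInstToHideOverhead() {
      return NeonNonConstStrideOverhead;
    }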
Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,84 @@
+; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-ios -S -mcpu=cyclone -enable-interleaved-mem-accesses=false -neon-nonconst-stride-overhead=5 < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
+
+@kernel = global [512 x float] zeroinitializer, align 16
+@kernel2 = global [512 x float] zeroinitializer, align 16
+@kernel3 = global [512 x float] zeroinitializer, align 16
+@kernel4 = global [512 x float] zeroinitializer, align 16
+@src_data = global [1536 x float] zeroinitializer, align 16
+@r_ = global i8 0, align 1
+@g_ = global i8 0, align 1
+@b_ = global i8 0, align 1
+
+; vectorize loop when we lower the cost of gather load
+; Make sure we vectorize it, VF = 4
+; CHECK: <4 x float>
+
+define void @_Z4testmm(i64 %size, i64 %offset) {
+entry:
+  %cmp53 = icmp eq i64 %size, 0
+  br i1 %cmp53, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:
+  br label %for.body
+
+for.body:
+  %r.057 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add10, %for.body ]
+  %g.056 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add20, %for.body ]
+  %v.055 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %b.054 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add30, %for.body ]
+  %add = add i64 %v.055, %offset
+  %mul = mul i64 %add, 3
+  %arrayidx = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 %mul
+  %0 = load float, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds [512 x float], ptr @kernel, i64 0, i64 %v.055
+  %1 = load float, ptr %arrayidx2, align 4
+  %mul3 = fmul fast float %0, %1
+  %arrayidx4 = getelementptr inbounds [512 x float], ptr @kernel2, i64 0, i64 %v.055
+  %2 = load float, ptr %arrayidx4, align 4
+  %mul5 = fmul fast float %mul3, %2
+  %arrayidx6 = getelementptr inbounds [512 x float], ptr @kernel3, i64 0, i64 %v.055
+  %3 = load float, ptr %arrayidx6, align 4
+  %mul7 = fmul fast float %mul5, %3
+  %arrayidx8 = getelementptr inbounds [512 x float], ptr @kernel4, i64 0, i64 %v.055
+  %4 = load float, ptr %arrayidx8, align 4
+  %mul9 = fmul fast float %mul7, %4
+  %add10 = fadd fast float %r.057, %mul9
+  %arrayidx.sum = add i64 %mul, 1
+  %arrayidx11 = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 %arrayidx.sum
+  %5 = load float, ptr %arrayidx11, align 4
+  %mul13 = fmul fast float %1, %5
+  %mul15 = fmul fast float %2, %mul13
+  %mul17 = fmul fast float %3, %mul15
+  %mul19 = fmul fast float %4, %mul17
+  %add20 = fadd fast float %g.056, %mul19
+  %arrayidx.sum52 = add i64 %mul, 2
+  %arrayidx21 = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 %arrayidx.sum52
+  %6 = load float, ptr %arrayidx21, align 4
+  %mul23 = fmul fast float %1, %6
+  %mul25 = fmul fast float %2, %mul23
+  %mul27 = fmul fast float %3, %mul25
+  %mul29 = fmul fast float %4, %mul27
+  %add30 = fadd fast float %b.054, %mul29
+  %inc = add i64 %v.055, 1
+  %exitcond = icmp ne i64 %inc, %size
+  br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:
+  %add30.lcssa = phi float [ %add30, %for.body ]
+  %add20.lcssa = phi float [ %add20, %for.body ]
+  %add10.lcssa = phi float [ %add10, %for.body ]
+  %phitmp = fptoui float %add10.lcssa to i8
+  %phitmp60 = fptoui float %add20.lcssa to i8
+  %phitmp61 = fptoui float %add30.lcssa to i8
+  br label %for.end
+
+for.end:
+  %r.0.lcssa = phi i8 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+  %g.0.lcssa = phi i8 [ %phitmp60, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+  %b.0.lcssa = phi i8 [ %phitmp61, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+  store i8 %r.0.lcssa, ptr @r_, align 1
+  store i8 %g.0.lcssa, ptr @g_, align 1
+  store i8 %b.0.lcssa, ptr @b_, align 1
+  ret void
+}
