Commit 58b8a84
[LV][AArch64] Prefer Fixed over Scalable if cost-model is equal (Neoverse V2)
For the Neoverse V2, prefer fixed-width vectorisation if the cost model assigns an equal cost to fixed and scalable vectorisation. This improves 7 kernels from TSVC-2 by about 2x, and does not affect SPEC2017 INT and FP.

This tends to benefit small kernels, like the ones in TSVC, for a number of reasons: processing the predicates does not come entirely for free, NEON tends to generate slightly less code, which can have a big impact on these small kernels, and there are second-order effects where SVE codegen is slightly less optimal in some areas.

This codegen strategy of generating more NEON is in line with GCC's, which is actually even more aggressive in generating NEON when no predication is required. We could be smarter and more aggressive about generating NEON (and improve performance further), but this seems a good and straightforward first step.

This depends on #95818.
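For reference, the scalar IR in the new test corresponds to a TSVC-style kernel of roughly the following shape (a C reconstruction from the test IR below, not part of the commit):

/* Reconstructed C equivalent of the loop vectorised in the test.
   With this change, on Neoverse V2 the loop vectoriser emits NEON
   (<4 x float>) rather than SVE (<vscale x 4 x float>) when the cost
   model rates both options equally. */
float a[32000], b[32000];

void kernel(void) {
  for (int i = 0; i < 16000; i++)
    a[i + 16000] = a[i] + b[i];
}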
1 parent 35ddc17

File tree: 2 files changed, +200 −1 lines

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (4 additions, 1 deletion)
@@ -4780,7 +4780,10 @@ bool LoopVectorizationPlanner::isMoreProfitable(
   // Assume vscale may be larger than 1 (or the value being tuned for),
   // so that scalable vectorization is slightly favorable over fixed-width
   // vectorization.
-  bool PreferScalable = A.Width.isScalable() && !B.Width.isScalable();
+  bool PreferScalable = false;
+  if (!TTI.preferFixedIfEqualToScalable())
+    PreferScalable = A.Width.isScalable() && !B.Width.isScalable();
+
   auto CmpFn = [PreferScalable](const InstructionCost &LHS,
                                 const InstructionCost &RHS) {
     return PreferScalable ? LHS <= RHS : LHS < RHS;
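When PreferScalable is false, the comparator falls back to the strict LHS < RHS, so an equal cost now resolves in favour of the fixed-width VF. The TTI.preferFixedIfEqualToScalable() hook queried above is introduced by the dependent change (#95818) and is not part of this diff; a minimal sketch of what that plumbing could look like is below — all names besides the hook itself are illustrative, not taken from the actual patch:

// Hedged sketch only, assuming the hook lands in the usual TTI pattern.
// Base TTI default: keep the existing tie-break in favour of scalable.
class TargetTransformInfoImplBase {
public:
  virtual bool preferFixedIfEqualToScalable() const { return false; }
};

// AArch64 override: opt in per CPU, e.g. via a subtarget feature set for
// Neoverse V2 (the feature query name here is hypothetical).
class AArch64TTIImpl : public TargetTransformInfoImplBase {
  const AArch64Subtarget *ST;
public:
  bool preferFixedIfEqualToScalable() const override {
    return ST->useFixedOverScalableIfEqualCost();
  }
};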
New test file (196 additions, 0 deletions)

@@ -0,0 +1,196 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S < %s -passes=loop-vectorize | FileCheck %s

target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "aarch64-unknown-linux-gnu"

@a = dso_local local_unnamed_addr global [32000 x float] zeroinitializer, align 64
@b = dso_local local_unnamed_addr global [32000 x float] zeroinitializer, align 64

define void @NeoverseV2() local_unnamed_addr #0 {
; CHECK-LABEL: define void @NeoverseV2(
; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP0]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 4
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 [[TMP0]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 [[TMP1]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 4
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP8]], align 4
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x float>, ptr [[TMP9]], align 4
; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <4 x float> [[WIDE_LOAD3]], [[WIDE_LOAD1]]
; CHECK-NEXT: [[TMP12:%.*]] = add nuw nsw i64 [[TMP0]], 16000
; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i64 [[TMP1]], 16000
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP12]]
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP13]]
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 4
; CHECK-NEXT: store <4 x float> [[TMP10]], ptr [[TMP16]], align 4
; CHECK-NEXT: store <4 x float> [[TMP11]], ptr [[TMP17]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16000
; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_COND_CLEANUP]]:
; CHECK-NEXT: ret void
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP20]], [[TMP19]]
; CHECK-NEXT: [[TMP21:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 16000
; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP21]]
; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX5]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16000
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
;
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv
  %0 = load float, ptr %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 %indvars.iv
  %1 = load float, ptr %arrayidx2, align 4
  %add = fadd fast float %1, %0
  %2 = add nuw nsw i64 %indvars.iv, 16000
  %arrayidx5 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %2
  store float %add, ptr %arrayidx5, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, 16000
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}

define void @NeoverseV1() #1 {
; CHECK-LABEL: define void @NeoverseV1(
; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16000, [[TMP1]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 16000, [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 16000, [[N_MOD_VF]]
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 0
; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 1
; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP6]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP11]]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[TMP16]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP14]], align 4
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP17]], align 4
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 [[TMP6]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 [[TMP11]]
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP18]], i32 0
; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP18]], i64 [[TMP22]]
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP20]], align 4
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP23]], align 4
; CHECK-NEXT: [[TMP24:%.*]] = fadd fast <vscale x 4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP25:%.*]] = fadd fast <vscale x 4 x float> [[WIDE_LOAD3]], [[WIDE_LOAD1]]
; CHECK-NEXT: [[TMP26:%.*]] = add nuw nsw i64 [[TMP6]], 16000
; CHECK-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[TMP11]], 16000
; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP26]]
; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP27]]
; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i32 0
; CHECK-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 4
; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[TMP32]]
; CHECK-NEXT: store <vscale x 4 x float> [[TMP24]], ptr [[TMP30]], align 4
; CHECK-NEXT: store <vscale x 4 x float> [[TMP25]], ptr [[TMP33]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP34]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 16000, [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_COND_CLEANUP]]:
; CHECK-NEXT: ret void
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP35:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP36:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP36]], [[TMP35]]
; CHECK-NEXT: [[TMP37:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 16000
; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP37]]
; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX5]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16000
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
;
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv
  %0 = load float, ptr %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 %indvars.iv
  %1 = load float, ptr %arrayidx2, align 4
  %add = fadd fast float %1, %0
  %2 = add nuw nsw i64 %indvars.iv, 16000
  %arrayidx5 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %2
  store float %add, ptr %arrayidx5, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, 16000
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}

attributes #0 = { mustprogress nofree norecurse nosync nounwind memory(readwrite, argmem: none, inaccessiblemem: none) uwtable vscale_range(1,16) "approx-func-fp-math"="true" "frame-pointer"="non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="neoverse-v2" "target-features"="+sve,+sve2,+v9a" "unsafe-fp-math"="true" }

attributes #1 = { mustprogress nofree norecurse nosync nounwind memory(readwrite, argmem: none, inaccessiblemem: none) uwtable vscale_range(1,16) "approx-func-fp-math"="true" "frame-pointer"="non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="neoverse-v1" "target-features"="+sve,+v9a" "unsafe-fp-math"="true" }

;.
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
;.
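As the NOTE at the top of the test records, the CHECK lines are autogenerated. If the expected codegen changes, they can be regenerated with something along these lines (the opt binary path and test path are illustrative, since the test's file name is not shown above):

python3 llvm/utils/update_test_checks.py --version 5 \
    --opt-binary=build/bin/opt \
    llvm/test/Transforms/LoopVectorize/AArch64/<this-test>.ll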
