
Commit 5a4b1b0

[LV][AArch64] Prefer Fixed over Scalable if cost-model is equal (Neoverse V2)
For the Neoverse V2 we would like to prefer fixed width over scalable vectorisation if the cost-model assigns an equal cost to both for certain loops. This improves 7 kernels from TSVC-2 and several production kernels by about 2x, and does not affect SPEC2017 INT and FP.

This also adds a new TTI hook that can steer the loop vectorizer towards preferring fixed width vectorization, which can be set per CPU. For now, this is only enabled for the Neoverse V2.

There are 3 reasons why preferring NEON might be better when the cost-model is a tie and the SVE vector size is the same as NEON (128-bit): architectural reasons, micro-architectural reasons, and SVE codegen reasons. The latter will be improved over time, so the more important reasons are the former two. That is, the (micro-)architectural reasons are the use of LDP/STP instructions, which are not available in SVE2, and the avoidance of predication.

For what it is worth: this codegen strategy of generating more NEON is in line with GCC's, which is actually even more aggressive in generating NEON when no predication is required. We could be smarter about the decision making, but this seems to be a good first step in the right direction, and we can always revise it later (for example by making the target hook more general).
1 parent 35ddc17 commit 5a4b1b0
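
To make the shape of the change easier to follow before the per-file diffs: a target opts in through a single TTI hook, the hook is backed by a per-CPU subtarget tuning flag, and the loop vectorizer consults it only to break cost-model ties. The following standalone C++ mock sketches that flow; the names (MockSubtarget, MockTTI, tieWinner) are simplified stand-ins invented for illustration, not the real LLVM classes.

#include <iostream>

// Minimal mock of the pieces this commit ties together. The real types are
// AArch64Subtarget, AArch64TTIImpl/TargetTransformInfo, and the tie-break in
// LoopVectorizationPlanner::isMoreProfitable (see the diffs below).
struct MockSubtarget {
  bool UseFixedOverScalableIfEqualCost = false; // per-CPU tuning flag
};

struct MockTTI {
  const MockSubtarget &ST;
  bool preferFixedOverScalableIfEqualCost() const {
    return ST.UseFixedOverScalableIfEqualCost;
  }
};

// If the fixed-width and scalable vector plans get the same cost, who wins?
const char *tieWinner(const MockTTI &TTI) {
  bool PreferScalable = true; // previous behaviour: scalable wins ties
  if (TTI.preferFixedOverScalableIfEqualCost())
    PreferScalable = false;   // new behaviour for CPUs that opt in
  return PreferScalable ? "scalable (SVE)" : "fixed width (NEON)";
}

int main() {
  MockSubtarget Generic;          // default: ties still go to SVE
  MockSubtarget NeoverseV2{true}; // the Neoverse V2 tuning sets the flag
  std::cout << tieWinner(MockTTI{Generic}) << '\n';    // scalable (SVE)
  std::cout << tieWinner(MockTTI{NeoverseV2}) << '\n'; // fixed width (NEON)
  return 0;
}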

8 files changed: 224 additions, 1 deletion

llvm/include/llvm/Analysis/TargetTransformInfo.h (9 additions, 0 deletions)

@@ -1674,6 +1674,11 @@ class TargetTransformInfo {
         false; ///< If op is an fp min/max, whether NaNs may be present.
   };

+  /// \returns True if the targets prefers fixed width vectorization if the
+  /// loop vectorizer's cost-model assigns an equal cost to the fixed and
+  /// scalable version of the vectorized loop.
+  bool preferFixedOverScalableIfEqualCost() const;
+
   /// \returns True if the target prefers reductions in loop.
   bool preferInLoopReduction(unsigned Opcode, Type *Ty,
                              ReductionFlags Flags) const;

@@ -2143,6 +2148,7 @@ class TargetTransformInfo::Concept {
   virtual unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                         unsigned ChainSizeInBytes,
                                         VectorType *VecTy) const = 0;
+  virtual bool preferFixedOverScalableIfEqualCost() const = 0;
   virtual bool preferInLoopReduction(unsigned Opcode, Type *Ty,
                                      ReductionFlags) const = 0;
   virtual bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty,

@@ -2873,6 +2879,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
                                 VectorType *VecTy) const override {
     return Impl.getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy);
   }
+  bool preferFixedOverScalableIfEqualCost() const override {
+    return Impl.preferFixedOverScalableIfEqualCost();
+  }
   bool preferInLoopReduction(unsigned Opcode, Type *Ty,
                              ReductionFlags Flags) const override {
     return Impl.preferInLoopReduction(Opcode, Ty, Flags);
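
The header change above follows the usual three-layer TTI dispatch: a public method on TargetTransformInfo, a pure virtual in the internal Concept interface, and a Model override that forwards to the concrete implementation. For readers unfamiliar with that pattern, here is a stripped-down, self-contained sketch of the same type-erasure idea; the class names (TTIFacade, BaseImpl, AArch64LikeImpl) are invented for illustration and this is not the real header.

#include <iostream>
#include <memory>
#include <utility>

// Public facade, playing the role of TargetTransformInfo.
class TTIFacade {
  struct Concept {
    virtual ~Concept() = default;
    virtual bool preferFixedOverScalableIfEqualCost() const = 0;
  };
  template <typename ImplT> struct Model final : Concept {
    ImplT Impl;
    explicit Model(ImplT I) : Impl(std::move(I)) {}
    bool preferFixedOverScalableIfEqualCost() const override {
      return Impl.preferFixedOverScalableIfEqualCost();
    }
  };
  std::unique_ptr<Concept> TTIImpl;

public:
  template <typename ImplT>
  explicit TTIFacade(ImplT Impl)
      : TTIImpl(std::make_unique<Model<ImplT>>(std::move(Impl))) {}
  bool preferFixedOverScalableIfEqualCost() const {
    return TTIImpl->preferFixedOverScalableIfEqualCost();
  }
};

// Conservative default, like TargetTransformInfoImplBase in the next file.
struct BaseImpl {
  bool preferFixedOverScalableIfEqualCost() const { return false; }
};

// Target override, like AArch64TTIImpl further down, driven by a tuning flag.
struct AArch64LikeImpl : BaseImpl {
  bool UseFixedOverScalableIfEqualCost = true; // e.g. set by Neoverse V2 tuning
  bool preferFixedOverScalableIfEqualCost() const {
    return UseFixedOverScalableIfEqualCost;
  }
};

int main() {
  TTIFacade Generic{BaseImpl{}};
  TTIFacade NeoverseV2{AArch64LikeImpl{}};
  std::cout << Generic.preferFixedOverScalableIfEqualCost() << '\n';    // 0
  std::cout << NeoverseV2.preferFixedOverScalableIfEqualCost() << '\n'; // 1
  return 0;
}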

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h (2 additions, 0 deletions)

@@ -913,6 +913,8 @@ class TargetTransformInfoImplBase {
     return VF;
   }

+  bool preferFixedOverScalableIfEqualCost() const { return false; }
+
   bool preferInLoopReduction(unsigned Opcode, Type *Ty,
                              TTI::ReductionFlags Flags) const {
     return false;

llvm/lib/Analysis/TargetTransformInfo.cpp (4 additions, 0 deletions)

@@ -1282,6 +1282,10 @@ unsigned TargetTransformInfo::getStoreVectorFactor(unsigned VF,
   return TTIImpl->getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy);
 }

+bool TargetTransformInfo::preferFixedOverScalableIfEqualCost() const {
+  return TTIImpl->preferFixedOverScalableIfEqualCost();
+}
+
 bool TargetTransformInfo::preferInLoopReduction(unsigned Opcode, Type *Ty,
                                                 ReductionFlags Flags) const {
   return TTIImpl->preferInLoopReduction(Opcode, Ty, Flags);

llvm/lib/Target/AArch64/AArch64Features.td (4 additions, 0 deletions)

@@ -244,6 +244,10 @@ def FeatureExperimentalZeroingPseudos
 def FeatureUseScalarIncVL : SubtargetFeature<"use-scalar-inc-vl",
   "UseScalarIncVL", "true", "Prefer inc/dec over add+cnt">;

+def FeatureUseFixedOverScalableIfEqualCost: SubtargetFeature<"use-fixed-over-scalable-equal-cost",
+  "UseFixedOverScalableIfEqualCost", "true",
+  "Prefer fixed width loop vectorization over scalable if the cost-model assigns equal costs">;
+
 def FeatureBF16 : Extension<"bf16", "BF16",
   "Enable BFloat16 Extension (FEAT_BF16)", [],
   "FEAT_BF16", "+bf16", 280>;

llvm/lib/Target/AArch64/AArch64Processors.td (1 addition, 0 deletions)

@@ -489,6 +489,7 @@ def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2
                                       FeatureALULSLFast,
                                       FeaturePostRAScheduler,
                                       FeatureEnableSelectOptimize,
+                                      FeatureUseFixedOverScalableIfEqualCost,
                                       FeaturePredictableSelectIsExpensive]>;

 def TuneNeoverseV3 : SubtargetFeature<"neoversev3", "ARMProcFamily", "NeoverseV3",

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h (4 additions, 0 deletions)

@@ -371,6 +371,10 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
     return TailFoldingStyle::DataWithoutLaneMask;
   }

+  bool preferFixedOverScalableIfEqualCost() const {
+    return ST->useFixedOverScalableIfEqualCost();
+  }
+
   bool preferPredicateOverEpilogue(TailFoldingInfo *TFI);

   bool supportsScalableVectors() const { return ST->hasSVE(); }

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (4 additions, 1 deletion)

@@ -4780,7 +4780,10 @@ bool LoopVectorizationPlanner::isMoreProfitable(
   // Assume vscale may be larger than 1 (or the value being tuned for),
   // so that scalable vectorization is slightly favorable over fixed-width
   // vectorization.
-  bool PreferScalable = A.Width.isScalable() && !B.Width.isScalable();
+  bool PreferScalable = A.Width.isScalable() && !B.Width.isScalable();
+  if (TTI.preferFixedOverScalableIfEqualCost())
+    PreferScalable = false;
+
   auto CmpFn = [PreferScalable](const InstructionCost &LHS,
                                 const InstructionCost &RHS) {
     return PreferScalable ? LHS <= RHS : LHS < RHS;
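
To see what the new code changes in the comparison above, take the tie case with concrete numbers. Below is a self-contained sketch of the CmpFn semantics, using plain integers in place of InstructionCost; the helper name cmp is made up for the example.

#include <cassert>

// Sketch of the CmpFn lambda in isMoreProfitable(): candidate A (here the
// scalable VF) beats candidate B (the fixed-width VF) when cmp() is true.
static bool cmp(bool PreferScalable, int CostA, int CostB) {
  return PreferScalable ? CostA <= CostB : CostA < CostB;
}

int main() {
  int CostA = 10, CostB = 10; // the cost-model gives both plans the same cost

  // Default behaviour: the scalable candidate wins the tie ('<=').
  assert(cmp(/*PreferScalable=*/true, CostA, CostB));

  // With preferFixedOverScalableIfEqualCost() returning true, a tie is no
  // longer enough, so the fixed-width candidate is kept ('<').
  assert(!cmp(/*PreferScalable=*/false, CostA, CostB));

  // A strictly cheaper scalable plan still wins either way.
  assert(cmp(/*PreferScalable=*/false, /*CostA=*/9, CostB));
  return 0;
}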
New test file (196 additions, 0 deletions)

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S < %s -passes=loop-vectorize | FileCheck %s

target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "aarch64-unknown-linux-gnu"

@a = dso_local local_unnamed_addr global [32000 x float] zeroinitializer, align 64
@b = dso_local local_unnamed_addr global [32000 x float] zeroinitializer, align 64

define void @NeoverseV2() local_unnamed_addr #0 {
; CHECK-LABEL: define void @NeoverseV2(
; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[TMP30:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP30]], 0
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP30]], 4
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP9]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 4
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 [[TMP9]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 [[TMP1]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 4
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP8]], align 4
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x float>, ptr [[TMP22]], align 4
; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <4 x float> [[WIDE_LOAD3]], [[WIDE_LOAD1]]
; CHECK-NEXT: [[TMP12:%.*]] = add nuw nsw i64 [[TMP9]], 16000
; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i64 [[TMP1]], 16000
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP12]]
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP13]]
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 4
; CHECK-NEXT: store <4 x float> [[TMP10]], ptr [[TMP16]], align 4
; CHECK-NEXT: store <4 x float> [[TMP11]], ptr [[TMP17]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP30]], 8
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16000
; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_COND_CLEANUP]]:
; CHECK-NEXT: ret void
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP20]], [[TMP19]]
; CHECK-NEXT: [[TMP21:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 16000
; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP21]]
; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX5]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16000
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
;
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv
  %0 = load float, ptr %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 %indvars.iv
  %1 = load float, ptr %arrayidx2, align 4
  %add = fadd fast float %1, %0
  %2 = add nuw nsw i64 %indvars.iv, 16000
  %arrayidx5 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %2
  store float %add, ptr %arrayidx5, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, 16000
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}

define void @GenericCPU() #1 {
; CHECK-LABEL: define void @GenericCPU(
; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16000, [[TMP1]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 16000, [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 16000, [[N_MOD_VF]]
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 0
; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 1
; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP6]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP11]]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[TMP16]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP14]], align 4
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP17]], align 4
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 [[TMP6]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 [[TMP11]]
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP18]], i32 0
; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP18]], i64 [[TMP22]]
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP20]], align 4
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP23]], align 4
; CHECK-NEXT: [[TMP24:%.*]] = fadd fast <vscale x 4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP25:%.*]] = fadd fast <vscale x 4 x float> [[WIDE_LOAD3]], [[WIDE_LOAD1]]
; CHECK-NEXT: [[TMP26:%.*]] = add nuw nsw i64 [[TMP6]], 16000
; CHECK-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[TMP11]], 16000
; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP26]]
; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP27]]
; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i32 0
; CHECK-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 4
; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[TMP32]]
; CHECK-NEXT: store <vscale x 4 x float> [[TMP24]], ptr [[TMP30]], align 4
; CHECK-NEXT: store <vscale x 4 x float> [[TMP25]], ptr [[TMP33]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP34]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 16000, [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_COND_CLEANUP]]:
; CHECK-NEXT: ret void
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP35:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP36:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP36]], [[TMP35]]
; CHECK-NEXT: [[TMP37:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 16000
; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP37]]
; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX5]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16000
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
;
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv
  %0 = load float, ptr %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 %indvars.iv
  %1 = load float, ptr %arrayidx2, align 4
  %add = fadd fast float %1, %0
  %2 = add nuw nsw i64 %indvars.iv, 16000
  %arrayidx5 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %2
  store float %add, ptr %arrayidx5, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, 16000
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}

attributes #0 = { mustprogress nofree norecurse nosync nounwind memory(readwrite, argmem: none, inaccessiblemem: none) uwtable vscale_range(1,16) "target-cpu"="neoverse-v2" "target-features"="+sve,+sve2,+v9a" }

attributes #1 = { mustprogress nofree norecurse nosync nounwind memory(readwrite, argmem: none, inaccessiblemem: none) uwtable vscale_range(1,16) "target-cpu"="generic" "target-features"="+sve,+v9a" }
;.
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
;.
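
For reference, both functions in this test are vectorized versions of the same scalar loop. A rough source-level equivalent, reconstructed from the IR with a made-up function name and assuming fast-math (the IR uses 'fadd fast'), is:

// Rough C++ equivalent of the loop in @NeoverseV2 and @GenericCPU. With the
// neoverse-v2 target-cpu the vectorizer now picks <4 x float> (NEON) for it,
// while the generic CPU still gets <vscale x 4 x float> (SVE).
float a[32000];
float b[32000];

void kernel() { // hypothetical name; the IR functions are @NeoverseV2/@GenericCPU
  for (long i = 0; i < 16000; ++i)
    a[i + 16000] = a[i] + b[i];
}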
