Skip to content

Commit c7dbe32

Browse files
committed
[AArch64][LoopVectorize] Enable tail-folding of simple loops on neoverse-v1
This patch enables the tail-folding of simple loops by default when targeting the neoverse-v1 CPU. Simple loops exclude those with recurrences or reductions, and loops that are reversed. New tests have been added in Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll. In terms of SPEC2017, only one benchmark is really affected when building with "-Ofast -mcpu=neoverse-v1 -flto" (+ faster, - slower): 525.x264: +7.0%. Differential Revision: https://reviews.llvm.org/D130618
1 parent 01efcec commit c7dbe32

File tree

5 files changed

+74
-14
lines changed

5 files changed

+74
-14
lines changed

llvm/lib/Target/AArch64/AArch64Subtarget.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,7 @@ void AArch64Subtarget::initializeProperties() {
226226
PrefLoopAlignment = Align(32);
227227
MaxBytesForLoopAlignment = 16;
228228
VScaleForTuning = 2;
229+
DefaultSVETFOpts = TailFoldingOpts::Simple;
229230
break;
230231
case Neoverse512TVB:
231232
PrefFunctionAlignment = Align(16);

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,9 @@ static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
3939
static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
4040
cl::init(10), cl::Hidden);
4141

42+
static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
43+
cl::init(15), cl::Hidden);
44+
4245
namespace {
4346
class TailFoldingOption {
4447
// These bitfields will only ever be set to something non-zero in operator=,
@@ -3558,8 +3561,19 @@ bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
35583561
if (Required == TailFoldingOpts::Disabled)
35593562
Required |= TailFoldingOpts::Simple;
35603563

3561-
return TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
3562-
Required);
3564+
if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
3565+
Required))
3566+
return false;
3567+
3568+
// Don't tail-fold for tight loops where we would be better off interleaving
3569+
// with an unpredicated loop.
3570+
unsigned NumInsns = 0;
3571+
for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
3572+
NumInsns += BB->sizeWithoutDebug();
3573+
}
3574+
3575+
// We expect 4 of these to be an IV PHI, IV add, IV compare and branch.
3576+
return NumInsns >= SVETailFoldInsnThreshold;
35633577
}
35643578

35653579
InstructionCost

llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
2-
; RUN: -mcpu=neoverse-v1 < %s | FileCheck %s --check-prefix=CHECK-EPILOG
3-
; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
4-
; RUN: -mcpu=neoverse-v1 < %s | FileCheck %s --check-prefix=CHECK-EPILOG
2+
; RUN: -mcpu=neoverse-v1 -sve-tail-folding=disabled < %s | FileCheck %s --check-prefix=CHECK-EPILOG
53
; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
64
; RUN: -mcpu=neoverse-v2 < %s | FileCheck %s --check-prefix=CHECK-NO-EPILOG
75
; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \

llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll

Lines changed: 55 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,16 @@
1-
; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding=disabled -S | FileCheck %s -check-prefix=CHECK-NOTF
2-
; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding=default -S | FileCheck %s -check-prefix=CHECK-NOTF
3-
; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding=all -S | FileCheck %s -check-prefix=CHECK-TF
4-
; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding=simple+reductions+recurrences+reverse -S | FileCheck %s -check-prefix=CHECK-TF
5-
; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding=all+noreductions -S | FileCheck %s -check-prefix=CHECK-TF-NORED
6-
; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding=all+norecurrences -S | FileCheck %s -check-prefix=CHECK-TF-NOREC
7-
; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding=all+noreverse -S | FileCheck %s -check-prefix=CHECK-TF-NOREV
8-
; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding=reductions -S | FileCheck %s -check-prefix=CHECK-TF-ONLYRED
1+
; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -sve-tail-folding=disabled -S | FileCheck %s -check-prefix=CHECK-NOTF
2+
; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -sve-tail-folding=default -S | FileCheck %s -check-prefix=CHECK-NOTF
3+
; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -S | FileCheck %s -check-prefix=CHECK-NOTF
4+
; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -sve-tail-folding=all -S | FileCheck %s -check-prefix=CHECK-TF
5+
; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -sve-tail-folding=simple+reductions+recurrences+reverse -S | FileCheck %s -check-prefix=CHECK-TF
6+
; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -S -mcpu=neoverse-v1 -sve-tail-folding=default+reductions+recurrences+reverse | FileCheck %s -check-prefix=CHECK-TF
7+
; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -sve-tail-folding=all+noreductions -S | FileCheck %s -check-prefix=CHECK-TF-NORED
8+
; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -sve-tail-folding=all+norecurrences -S | FileCheck %s -check-prefix=CHECK-TF-NOREC
9+
; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -sve-tail-folding=all+noreverse -S | FileCheck %s -check-prefix=CHECK-TF-NOREV
10+
; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -sve-tail-folding=reductions -S | FileCheck %s -check-prefix=CHECK-TF-ONLYRED
11+
; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -S -sve-tail-folding=default -mcpu=neoverse-v1 | FileCheck %s -check-prefix=CHECK-NEOVERSE-V1
12+
; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -S -mcpu=neoverse-v1 -sve-tail-folding=default | FileCheck %s -check-prefix=CHECK-NEOVERSE-V1
13+
; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -S -mcpu=neoverse-v1 | FileCheck %s -check-prefix=CHECK-NEOVERSE-V1
914

1015
target triple = "aarch64-unknown-linux-gnu"
1116

@@ -58,6 +63,14 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
5863
; CHECK-TF-ONLYRED-NOT: %{{.*}} = phi <vscale x 4 x i1>
5964
; CHECK-TF-ONLYRED: store <vscale x 4 x i32> %[[SPLAT]], ptr
6065

66+
; CHECK-NEOVERSE-V1-LABEL: @simple_memset(
67+
; CHECK-NEOVERSE-V1: vector.ph:
68+
; CHECK-NEOVERSE-V1: %[[INSERT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %val, i64 0
69+
; CHECK-NEOVERSE-V1: %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32> %[[INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
70+
; CHECK-NEOVERSE-V1: vector.body:
71+
; CHECK-NEOVERSE-V1: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
72+
; CHECK-NEOVERSE-V1: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %[[SPLAT]], {{.*}} %[[ACTIVE_LANE_MASK]]
73+
6174
entry:
6275
br label %while.body
6376

@@ -129,6 +142,15 @@ define float @fadd_red_fast(ptr noalias nocapture readonly %a, i64 %n) #0 {
129142
; CHECK-TF-ONLYRED: %[[SEL:.*]] = select fast <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]], <vscale x 4 x float> %[[ADD]], <vscale x 4 x float> %[[VEC_PHI]]
130143
; CHECK-TF-ONLYRED: middle.block:
131144
; CHECK-TF-ONLYRED-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[SEL]])
145+
146+
; CHECK-NEOVERSE-V1-LABEL: @fadd_red_fast
147+
; CHECK-NEOVERSE-V1: vector.body:
148+
; CHECK-NEOVERSE-V1-NOT: %{{.*}} = phi <vscale x 4 x i1>
149+
; CHECK-NEOVERSE-V1: %[[LOAD:.*]] = load <vscale x 4 x float>
150+
; CHECK-NEOVERSE-V1: %[[ADD:.*]] = fadd fast <vscale x 4 x float> %[[LOAD]]
151+
; CHECK-NEOVERSE-V1: middle.block:
152+
; CHECK-NEOVERSE-V1-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[ADD]])
153+
132154
entry:
133155
br label %for.body
134156

@@ -225,6 +247,19 @@ define void @add_recur(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
225247
; CHECK-TF-ONLYRED: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
226248
; CHECK-TF-ONLYRED: store <vscale x 4 x i32> %[[ADD]]
227249

250+
; CHECK-NEOVERSE-V1-LABEL: @add_recur
251+
; CHECK-NEOVERSE-V1: entry:
252+
; CHECK-NEOVERSE-V1: %[[PRE:.*]] = load i32, ptr %src, align 4
253+
; CHECK-NEOVERSE-V1: vector.ph:
254+
; CHECK-NEOVERSE-V1: %[[RECUR_INIT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[PRE]]
255+
; CHECK-NEOVERSE-V1: vector.body:
256+
; CHECK-NEOVERSE-V1-NOT: %{{.*}} = phi <vscale x 4 x i1>
257+
; CHECK-NEOVERSE-V1: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
258+
; CHECK-NEOVERSE-V1: %[[LOAD]] = load <vscale x 4 x i32>
259+
; CHECK-NEOVERSE-V1: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
260+
; CHECK-NEOVERSE-V1: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
261+
; CHECK-NEOVERSE-V1: store <vscale x 4 x i32> %[[ADD]]
262+
228263
entry:
229264
%.pre = load i32, ptr %src, align 4
230265
br label %for.body
@@ -276,6 +311,12 @@ define void @interleave(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
276311
; CHECK-TF-NOREV: %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
277312
; CHECK-TF-NOREV: %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
278313

314+
; CHECK-NEOVERSE-V1-LABEL: @interleave(
315+
; CHECK-NEOVERSE-V1: vector.body:
316+
; CHECK-NEOVERSE-V1: %[[LOAD:.*]] = load <8 x float>, ptr
317+
; CHECK-NEOVERSE-V1: %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
318+
; CHECK-NEOVERSE-V1: %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
319+
279320
entry:
280321
br label %for.body
281322

@@ -335,6 +376,12 @@ define void @reverse(ptr noalias %dst, ptr noalias %src) #0 {
335376
; CHECK-TF-NOREC: %[[REVERSE_MASK:.*]] = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %[[ACTIVE_LANE_MASK]])
336377
; CHECK-TF-NOREC: %[[MASKED_LOAD:.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0({{.*}} <vscale x 2 x i1> %reverse
337378

379+
; CHECK-TF-NEOVERSE-V1-LABEL: @reverse(
380+
; CHECK-TF-NEOVERSE-V1: vector.body:
381+
; CHECK-TF-NEOVERSE-V1-NOT: %{{.*}} = phi <vscale x 4 x i1>
382+
; CHECK-TF-NEOVERSE-V1: %[[LOAD:.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* %18, align 8
383+
; CHECK-TF-NEOVERSE-V1: %{{.*}} = call <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double> %[[LOAD]])
384+
338385
entry:
339386
br label %for.body
340387

llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2-
; RUN: opt -passes='loop-vectorize,instcombine' -sve-tail-folding=all -S < %s | FileCheck %s
2+
; RUN: opt -passes='loop-vectorize,instcombine' -sve-tail-folding-insn-threshold=0 -sve-tail-folding=all -S < %s | FileCheck %s
33

44
target triple = "aarch64"
55

0 commit comments

Comments
 (0)