Skip to content

[AArch64] Set MaxInterleaving to 4 for Neoverse V2 and V3 #100385

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -630,6 +630,10 @@ class TargetTransformInfo {
AssumptionCache &AC, TargetLibraryInfo *LibInfo,
HardwareLoopInfo &HWLoopInfo) const;

/// Query the target for the minimum vectorization factor at which epilogue
/// vectorization should be considered.
unsigned getEpilogueVectorizationMinVF() const;

/// Query the target whether it would be preferred to create a predicated
/// vector loop, which can avoid the need to emit a scalar epilogue loop.
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const;
Expand Down Expand Up @@ -1912,6 +1916,7 @@ class TargetTransformInfo::Concept {
AssumptionCache &AC,
TargetLibraryInfo *LibInfo,
HardwareLoopInfo &HWLoopInfo) = 0;
virtual unsigned getEpilogueVectorizationMinVF() = 0;
virtual bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) = 0;
virtual TailFoldingStyle
getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) = 0;
Expand Down Expand Up @@ -2392,6 +2397,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
HardwareLoopInfo &HWLoopInfo) override {
return Impl.isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
}
// Forwards the epilogue-vectorization minimum-VF query to the wrapped
// implementation object Impl and returns its answer unchanged.
unsigned getEpilogueVectorizationMinVF() override {
return Impl.getEpilogueVectorizationMinVF();
}
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) override {
return Impl.preferPredicateOverEpilogue(TFI);
}
Expand Down
2 changes: 2 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,8 @@ class TargetTransformInfoImplBase {
return false;
}

// Base-class default for the epilogue-vectorization minimum VF: always 16.
unsigned getEpilogueVectorizationMinVF() const { return 16; }

bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const { return false; }

TailFoldingStyle
Expand Down
4 changes: 4 additions & 0 deletions llvm/include/llvm/CodeGen/BasicTTIImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -666,6 +666,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
}

// Adds no logic of its own at this layer; returns whatever BaseT reports as
// the epilogue-vectorization minimum VF.
unsigned getEpilogueVectorizationMinVF() {
return BaseT::getEpilogueVectorizationMinVF();
}

bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
return BaseT::preferPredicateOverEpilogue(TFI);
}
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Analysis/TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,10 @@ bool TargetTransformInfo::isHardwareLoopProfitable(
return TTIImpl->isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
}

// Public entry point for the epilogue-vectorization minimum-VF query;
// delegates to the implementation held in TTIImpl.
unsigned TargetTransformInfo::getEpilogueVectorizationMinVF() const {
return TTIImpl->getEpilogueVectorizationMinVF();
}

bool TargetTransformInfo::preferPredicateOverEpilogue(
TailFoldingInfo *TFI) const {
return TTIImpl->preferPredicateOverEpilogue(TFI);
Expand Down
5 changes: 3 additions & 2 deletions llvm/lib/Target/AArch64/AArch64Subtarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -255,12 +255,13 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
MaxBytesForLoopAlignment = 16;
break;
case NeoverseV2:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A comment here to explain why "4" is chosen would be nice.

// Specialize cost for Neoverse-V2.
case NeoverseV3:
EpilogueVectorizationMinVF = 8;
MaxInterleaveFactor = 4;
ScatterOverhead = 13;
LLVM_FALLTHROUGH;
case NeoverseN2:
case NeoverseN3:
case NeoverseV3:
PrefFunctionAlignment = Align(16);
PrefLoopAlignment = Align(32);
MaxBytesForLoopAlignment = 16;
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AArch64/AArch64Subtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
bool ATTRIBUTE = DEFAULT;
#include "AArch64GenSubtargetInfo.inc"

unsigned EpilogueVectorizationMinVF = 16;
uint8_t MaxInterleaveFactor = 2;
uint8_t VectorInsertExtractBaseCost = 2;
uint16_t CacheLineSize = 0;
Expand Down Expand Up @@ -237,6 +238,9 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
hasFuseAdrpAdd() || hasFuseLiterals();
}

// Returns this subtarget's EpilogueVectorizationMinVF member (the minimum
// vectorization factor used as the epilogue-vectorization threshold).
unsigned getEpilogueVectorizationMinVF() const {
return EpilogueVectorizationMinVF;
}
unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
unsigned getVectorInsertExtractBaseCost() const;
unsigned getCacheLineSize() const override { return CacheLineSize; }
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4736,6 +4736,10 @@ static bool containsDecreasingPointers(Loop *TheLoop,
return false;
}

// AArch64 override: reports the epilogue-vectorization minimum VF stored on
// the current subtarget (ST).
unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const {
return ST->getEpilogueVectorizationMinVF();
}

bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
if (!ST->hasSVE())
return false;
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
return ST->useFixedOverScalableIfEqualCost();
}

unsigned getEpilogueVectorizationMinVF() const;

bool preferPredicateOverEpilogue(TailFoldingInfo *TFI);

bool supportsScalableVectors() const {
Expand Down
8 changes: 5 additions & 3 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ static cl::opt<unsigned> EpilogueVectorizationForceVF(
"loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
"epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
"epilogue-vectorization-minimum-VF", cl::Hidden,
cl::desc("Only loops with vectorization factor equal to or larger than "
"the specified value are considered for epilogue vectorization."));

Expand Down Expand Up @@ -4701,8 +4701,10 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
// See related "TODO: extend to support scalable VFs." in
// selectEpilogueVectorizationFactor.
unsigned Multiplier = VF.isFixed() ? IC : 1;
return getEstimatedRuntimeVF(TheLoop, TTI, VF * Multiplier) >=
EpilogueVectorizationMinVF;
unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
? EpilogueVectorizationMinVF
: TTI.getEpilogueVectorizationMinVF();
return getEstimatedRuntimeVF(TheLoop, TTI, VF * Multiplier) >= MinVFThreshold;
}

VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a14 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a15 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a16 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mcpu=neoverse-v2 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mcpu=neoverse-v3 -S %s | FileCheck --check-prefix=INTERLEAVE-4-VLA %s

; Tests for selecting interleave counts for loops with loads and stores.

Expand Down Expand Up @@ -213,6 +215,12 @@ define void @interleave_single_load_store(ptr %src, ptr %dst, i64 %N, i8 %a, i8
; INTERLEAVE-2: exit:
; INTERLEAVE-2-NEXT: ret void
;
; INTERLEAVE-4-VLA-LABEL: @interleave_single_load_store(
; INTERLEAVE-4-VLA: call <vscale x 16 x i8> @llvm.smax.nxv16i8(
; INTERLEAVE-4-VLA-NEXT: call <vscale x 16 x i8> @llvm.smax.nxv16i8(
; INTERLEAVE-4-VLA-NEXT: call <vscale x 16 x i8> @llvm.smax.nxv16i8(
; INTERLEAVE-4-VLA-NEXT: call <vscale x 16 x i8> @llvm.smax.nxv16i8(
;
entry:
br label %loop

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a14 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a15 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a16 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mcpu=neoverse-v2 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mcpu=neoverse-v3 -S %s | FileCheck --check-prefix=INTERLEAVE-4-VLA %s

; Tests for selecting the interleave count for loops with reductions.

Expand Down Expand Up @@ -138,6 +140,12 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) {
; INTERLEAVE-2-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
; INTERLEAVE-2-NEXT: ret i32 [[RED_NEXT_LCSSA]]
;
; INTERLEAVE-4-VLA-LABEL: @interleave_integer_reduction(
; INTERLEAVE-4-VLA: add <vscale x 4 x i32>
; INTERLEAVE-4-VLA-NEXT: add <vscale x 4 x i32>
; INTERLEAVE-4-VLA-NEXT: add <vscale x 4 x i32>
; INTERLEAVE-4-VLA-NEXT: add <vscale x 4 x i32>
;
entry:
br label %loop

Expand Down
118 changes: 118 additions & 0 deletions llvm/test/Transforms/LoopVectorize/AArch64/neoverse-epilogue-vect.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
; RUN: opt -passes=loop-vectorize -S < %s | FileCheck %s

target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "aarch64-unknown-linux-gnu"

; Element-wise loop over doubles: each iteration loads from %0 and %1 at the
; same index, adds the two values and stores the sum back through %0. The trip
; count is the zero-extended %2, and the loop is skipped when %2 <= 0.
; Uses attribute group #0 (target-cpu neoverse-v1); the directives below
; require that none of the epilogue-vectorization blocks are emitted.
define noundef i32 @V1(ptr noalias nocapture noundef %0, ptr noalias nocapture noundef readonly %1, i32 noundef %2) #0 {
; CHECK-LABEL: @V1(
; CHECK-NOT: vec.epilog.ph:
; CHECK-NOT: vec.epilog.vector.body:
; CHECK-NOT: vec.epilog.middle.block:
; CHECK-NOT: vec.epilog.scalar.ph:
;
entry:
%4 = icmp sgt i32 %2, 0
br i1 %4, label %5, label %8

5:
%6 = zext nneg i32 %2 to i64
br label %9

7:
br label %8

8:
ret i32 42

9:
%10 = phi i64 [ 0, %5 ], [ %16, %9 ]
%11 = getelementptr inbounds double, ptr %0, i64 %10
%12 = load double, ptr %11, align 8
%13 = getelementptr inbounds double, ptr %1, i64 %10
%14 = load double, ptr %13, align 8
%15 = fadd fast double %14, %12
store double %15, ptr %11, align 8
%16 = add nuw nsw i64 %10, 1
%17 = icmp eq i64 %16, %6
br i1 %17, label %7, label %9
}

; Element-wise loop over doubles: each iteration loads from %0 and %1 at the
; same index, adds the two values and stores the sum back through %0. The trip
; count is the zero-extended %2, and the loop is skipped when %2 <= 0.
; Uses attribute group #1 (target-cpu neoverse-v2); the directives below
; require that the epilogue-vectorization blocks are present in the output.
define noundef i32 @V2(ptr noalias nocapture noundef %0, ptr noalias nocapture noundef readonly %1, i32 noundef %2) #1 {
;
; CHECK-LABEL: @V2(
; CHECK: vec.epilog.ph:
; CHECK: vec.epilog.vector.body:
; CHECK: vec.epilog.middle.block:
; CHECK: vec.epilog.scalar.ph:
;
entry:
%4 = icmp sgt i32 %2, 0
br i1 %4, label %5, label %8

5:
%6 = zext nneg i32 %2 to i64
br label %9

7:
br label %8

8:
ret i32 42

9:
%10 = phi i64 [ 0, %5 ], [ %16, %9 ]
%11 = getelementptr inbounds double, ptr %0, i64 %10
%12 = load double, ptr %11, align 8
%13 = getelementptr inbounds double, ptr %1, i64 %10
%14 = load double, ptr %13, align 8
%15 = fadd fast double %14, %12
store double %15, ptr %11, align 8
%16 = add nuw nsw i64 %10, 1
%17 = icmp eq i64 %16, %6
br i1 %17, label %7, label %9
}

; TODO: The V3 will generate a scalable vector body, so it doesn't need an
; epilogue loop, but it still needs to be checked whether that is really the
; best thing to do for the V3.
;
; Element-wise loop over doubles: each iteration loads from %0 and %1 at the
; same index, adds the two values and stores the sum back through %0. The trip
; count is the zero-extended %2, and the loop is skipped when %2 <= 0.
; Uses attribute group #2 (target-cpu neoverse-v3); the directives below
; require that none of the epilogue-vectorization blocks are emitted.
define noundef i32 @V3(ptr noalias nocapture noundef %0, ptr noalias nocapture noundef readonly %1, i32 noundef %2) #2 {
;
; CHECK-LABEL: @V3(
; CHECK-NOT: vec.epilog.ph:
; CHECK-NOT: vec.epilog.vector.body:
; CHECK-NOT: vec.epilog.middle.block:
; CHECK-NOT: vec.epilog.scalar.ph:
;
entry:
%4 = icmp sgt i32 %2, 0
br i1 %4, label %5, label %8

5:
%6 = zext nneg i32 %2 to i64
br label %9

7:
br label %8

8:
ret i32 42

9:
%10 = phi i64 [ 0, %5 ], [ %16, %9 ]
%11 = getelementptr inbounds double, ptr %0, i64 %10
%12 = load double, ptr %11, align 8
%13 = getelementptr inbounds double, ptr %1, i64 %10
%14 = load double, ptr %13, align 8
%15 = fadd fast double %14, %12
store double %15, ptr %11, align 8
%16 = add nuw nsw i64 %10, 1
%17 = icmp eq i64 %16, %6
br i1 %17, label %7, label %9
}

attributes #0 = { vscale_range(1,16) "target-cpu"="neoverse-v1" "target-features"="+sve2" }

attributes #1 = { vscale_range(1,16) "target-cpu"="neoverse-v2" "target-features"="+sve2" }

attributes #2 = { vscale_range(1,16) "target-cpu"="neoverse-v3" "target-features"="+sve2" }
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
; RUN: -mcpu=neoverse-v1 -sve-tail-folding=disabled < %s | FileCheck %s --check-prefix=CHECK-EPILOG
; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
; RUN: -mcpu=neoverse-v2 < %s | FileCheck %s --check-prefix=CHECK-NO-EPILOG
; RUN: -mcpu=neoverse-v2 < %s | FileCheck %s --check-prefix=CHECK-EPILOG-V2
; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
; RUN: -mcpu=cortex-x2 < %s | FileCheck %s --check-prefix=CHECK-NO-EPILOG

Expand All @@ -12,6 +12,11 @@ define void @foo(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i6
; CHECK-EPILOG: vec.epilog.vector.body:
; CHECK-EPILOG: load <vscale x 4 x i16>

; The epilogue loop gets vectorised vscale x 2 x i16 wide.
; CHECK-EPILOG-V2: vec.epilog.ph:
; CHECK-EPILOG-V2: vec.epilog.vector.body:
; CHECK-EPILOG-V2: load <vscale x 2 x i16>

; CHECK-NO-EPILOG-NOT: vec.epilog.vector.ph:
; CHECK-NO-EPILOG-NOT: vec.epilog.vector.body:
entry:
Expand Down
Loading