[AArch64] Set MaxInterleaving to 4 for Neoverse V2

sjoerdmeijer · sjoerdmeijer · commit a834320d8a38 · 2024-10-09T06:05:22.000-07:00
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -626,6 +626,10 @@ class TargetTransformInfo {
                                 AssumptionCache &AC, TargetLibraryInfo *LibInfo,
                                 HardwareLoopInfo &HWLoopInfo) const;
 
+  // Query the target for which minimum vectorization factor epilogue
+  // vectorization should be considered.
+  unsigned getEpilogueVectorizationMinVF() const;
+
   /// Query the target whether it would be prefered to create a predicated
   /// vector loop, which can avoid the need to emit a scalar epilogue loop.
   bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const;
@@ -1865,6 +1869,7 @@ class TargetTransformInfo::Concept {
                                         AssumptionCache &AC,
                                         TargetLibraryInfo *LibInfo,
                                         HardwareLoopInfo &HWLoopInfo) = 0;
+  virtual unsigned getEpilogueVectorizationMinVF() = 0;
   virtual bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) = 0;
   virtual TailFoldingStyle
   getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) = 0;
@@ -2319,6 +2324,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
                                 HardwareLoopInfo &HWLoopInfo) override {
     return Impl.isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
   }
+  unsigned getEpilogueVectorizationMinVF() override {
+    return Impl.getEpilogueVectorizationMinVF();
+  }
   bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) override {
     return Impl.preferPredicateOverEpilogue(TFI);
   }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -192,6 +192,8 @@ class TargetTransformInfoImplBase {
     return false;
   }
 
+  unsigned getEpilogueVectorizationMinVF() const { return 16; }
+
   bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const { return false; }
 
   TailFoldingStyle
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -667,6 +667,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
   }
 
+  unsigned getEpilogueVectorizationMinVF() {
+    return BaseT::getEpilogueVectorizationMinVF();
+  }
+
   bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
     return BaseT::preferPredicateOverEpilogue(TFI);
   }
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -352,6 +352,10 @@ bool TargetTransformInfo::isHardwareLoopProfitable(
   return TTIImpl->isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
 }
 
+unsigned TargetTransformInfo::getEpilogueVectorizationMinVF() const {
+  return TTIImpl->getEpilogueVectorizationMinVF();
+}
+
 bool TargetTransformInfo::preferPredicateOverEpilogue(
     TailFoldingInfo *TFI) const {
   return TTIImpl->preferPredicateOverEpilogue(TFI);
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -234,12 +234,13 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
     MaxBytesForLoopAlignment = 16;
     break;
   case NeoverseV2:
-    // Specialize cost for Neoverse-V2.
+  case NeoverseV3:
+    EpilogueVectorizationMinVF = 8;
+    MaxInterleaveFactor = 4;
     ScatterOverhead = 13;
     LLVM_FALLTHROUGH;
   case NeoverseN2:
   case NeoverseN3:
-  case NeoverseV3:
     PrefFunctionAlignment = Align(16);
     PrefLoopAlignment = Align(32);
     MaxBytesForLoopAlignment = 16;
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -56,6 +56,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
   bool ATTRIBUTE = DEFAULT;
 #include "AArch64GenSubtargetInfo.inc"
 
+  unsigned EpilogueVectorizationMinVF = 16;
   uint8_t MaxInterleaveFactor = 2;
   uint8_t VectorInsertExtractBaseCost = 2;
   uint16_t CacheLineSize = 0;
@@ -225,6 +226,9 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
            hasFuseAdrpAdd() || hasFuseLiterals();
   }
 
+  unsigned getEpilogueVectorizationMinVF() const {
+    return EpilogueVectorizationMinVF;
+  }
   unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
   unsigned getVectorInsertExtractBaseCost() const;
   unsigned getCacheLineSize() const override { return CacheLineSize; }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4581,6 +4581,10 @@ static bool containsDecreasingPointers(Loop *TheLoop,
   return false;
 }
 
+unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const {
+  return ST->getEpilogueVectorizationMinVF();
+}
+
 bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
   if (!ST->hasSVE())
     return false;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -376,6 +376,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
     return ST->useFixedOverScalableIfEqualCost();
   }
 
+  unsigned getEpilogueVectorizationMinVF() const;
+
   bool preferPredicateOverEpilogue(TailFoldingInfo *TFI);
 
   bool supportsScalableVectors() const {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -185,7 +185,7 @@ static cl::opt<unsigned> EpilogueVectorizationForceVF(
              "loops."));
 
 static cl::opt<unsigned> EpilogueVectorizationMinVF(
-    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
+    "epilogue-vectorization-minimum-VF", cl::Hidden,
     cl::desc("Only loops with vectorization factor equal to or larger than "
              "the specified value are considered for epilogue vectorization."));
 
@@ -4644,7 +4644,11 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
   if (TTI.getMaxInterleaveFactor(VF) <= 1)
     return false;
 
-  if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
+   unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
+                                ? EpilogueVectorizationMinVF
+                                : TTI.getEpilogueVectorizationMinVF();
+
+   if ((Multiplier * VF.getKnownMinValue()) >= MinVFThreshold)
     return true;
   return false;
 }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll
@@ -5,6 +5,8 @@
 ; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a14 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
 ; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a15 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
 ; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a16 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
+; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mcpu=neoverse-v2 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
+; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mcpu=neoverse-v3 -S %s | FileCheck --check-prefix=INTERLEAVE-4-VLA %s
 
 ; Tests for selecting interleave counts for loops with loads and stores.
 
@@ -213,6 +215,12 @@ define void @interleave_single_load_store(ptr %src, ptr %dst, i64 %N, i8 %a, i8
 ; INTERLEAVE-2:       exit:
 ; INTERLEAVE-2-NEXT:    ret void
 ;
+; INTERLEAVE-4-VLA-LABEL: @interleave_single_load_store(
+; INTERLEAVE-4-VLA:       call <vscale x 16 x i8> @llvm.smax.nxv16i8(
+; INTERLEAVE-4-VLA-NEXT:  call <vscale x 16 x i8> @llvm.smax.nxv16i8(
+; INTERLEAVE-4-VLA-NEXT:  call <vscale x 16 x i8> @llvm.smax.nxv16i8(
+; INTERLEAVE-4-VLA-NEXT:  call <vscale x 16 x i8> @llvm.smax.nxv16i8(
+;
 entry:
   br label %loop
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll
@@ -5,6 +5,8 @@
 ; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a14 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
 ; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a15 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
 ; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a16 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
+; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mcpu=neoverse-v2 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
+; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mcpu=neoverse-v3 -S %s | FileCheck --check-prefix=INTERLEAVE-4-VLA %s
 
 ; Tests for selecting the interleave count for loops with reductions.
 
@@ -138,6 +140,12 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) {
 ; INTERLEAVE-2-NEXT:    [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
 ; INTERLEAVE-2-NEXT:    ret i32 [[RED_NEXT_LCSSA]]
 ;
+; INTERLEAVE-4-VLA-LABEL: @interleave_integer_reduction(
+; INTERLEAVE-4-VLA:       add <vscale x 4 x i32>
+; INTERLEAVE-4-VLA-NEXT:  add <vscale x 4 x i32>
+; INTERLEAVE-4-VLA-NEXT:  add <vscale x 4 x i32>
+; INTERLEAVE-4-VLA-NEXT:  add <vscale x 4 x i32>
+;
 entry:
   br label %loop
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/neoverse-epilogue-vect.ll b/llvm/test/Transforms/LoopVectorize/AArch64/neoverse-epilogue-vect.ll
@@ -0,0 +1,118 @@
+; RUN: opt -passes=loop-vectorize -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64-unknown-linux-gnu"
+
+define noundef i32 @V1(ptr noalias nocapture noundef %0, ptr noalias nocapture noundef readonly %1, i32 noundef %2) #0 {
+; CHECK-LABEL: @V1(
+; CHECK-NOT:   vec.epilog.ph:
+; CHECK-NOT:   vec.epilog.vector.body:
+; CHECK-NOT:   vec.epilog.middle.block:
+; CHECK-NOT:   vec.epilog.scalar.ph:
+;
+entry:
+  %4 = icmp sgt i32 %2, 0
+  br i1 %4, label %5, label %8
+
+5:
+  %6 = zext nneg i32 %2 to i64
+  br label %9
+
+7:
+  br label %8
+
+8:
+  ret i32 42
+
+9:
+  %10 = phi i64 [ 0, %5 ], [ %16, %9 ]
+  %11 = getelementptr inbounds double, ptr %0, i64 %10
+  %12 = load double, ptr %11, align 8
+  %13 = getelementptr inbounds double, ptr %1, i64 %10
+  %14 = load double, ptr %13, align 8
+  %15 = fadd fast double %14, %12
+  store double %15, ptr %11, align 8
+  %16 = add nuw nsw i64 %10, 1
+  %17 = icmp eq i64 %16, %6
+  br i1 %17, label %7, label %9
+}
+
+define noundef i32 @V2(ptr noalias nocapture noundef %0, ptr noalias nocapture noundef readonly %1, i32 noundef %2) #1 {
+;
+; CHECK-LABEL: @V2(
+; CHECK:       vec.epilog.ph:
+; CHECK:       vec.epilog.vector.body:
+; CHECK:       vec.epilog.middle.block:
+; CHECK:       vec.epilog.scalar.ph:
+;
+entry:
+  %4 = icmp sgt i32 %2, 0
+  br i1 %4, label %5, label %8
+
+5:
+  %6 = zext nneg i32 %2 to i64
+  br label %9
+
+7:
+  br label %8
+
+8:
+  ret i32 42
+
+9:
+  %10 = phi i64 [ 0, %5 ], [ %16, %9 ]
+  %11 = getelementptr inbounds double, ptr %0, i64 %10
+  %12 = load double, ptr %11, align 8
+  %13 = getelementptr inbounds double, ptr %1, i64 %10
+  %14 = load double, ptr %13, align 8
+  %15 = fadd fast double %14, %12
+  store double %15, ptr %11, align 8
+  %16 = add nuw nsw i64 %10, 1
+  %17 = icmp eq i64 %16, %6
+  br i1 %17, label %7, label %9
+}
+
+; TODO: The V3 will generate a scalable vector body, so doesn't need a
+; epilogue loop, but will need to be checked that is really the best thing to
+; for the V3.
+;
+define noundef i32 @V3(ptr noalias nocapture noundef %0, ptr noalias nocapture noundef readonly %1, i32 noundef %2) #2 {
+;
+; CHECK-LABEL: @V3(
+; CHECK-NOT:   vec.epilog.ph:
+; CHECK-NOT:   vec.epilog.vector.body:
+; CHECK-NOT:   vec.epilog.middle.block:
+; CHECK-NOT:   vec.epilog.scalar.ph:
+;
+entry:
+  %4 = icmp sgt i32 %2, 0
+  br i1 %4, label %5, label %8
+
+5:
+  %6 = zext nneg i32 %2 to i64
+  br label %9
+
+7:
+  br label %8
+
+8:
+  ret i32 42
+
+9:
+  %10 = phi i64 [ 0, %5 ], [ %16, %9 ]
+  %11 = getelementptr inbounds double, ptr %0, i64 %10
+  %12 = load double, ptr %11, align 8
+  %13 = getelementptr inbounds double, ptr %1, i64 %10
+  %14 = load double, ptr %13, align 8
+  %15 = fadd fast double %14, %12
+  store double %15, ptr %11, align 8
+  %16 = add nuw nsw i64 %10, 1
+  %17 = icmp eq i64 %16, %6
+  br i1 %17, label %7, label %9
+}
+
+attributes #0 = { vscale_range(1,16) "target-cpu"="neoverse-v1" "target-features"="+sve2" }
+
+attributes #1 = { vscale_range(1,16) "target-cpu"="neoverse-v2" "target-features"="+sve2" }
+
+attributes #2 = { vscale_range(1,16) "target-cpu"="neoverse-v3" "target-features"="+sve2" }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll
@@ -1,7 +1,7 @@
 ; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
 ; RUN:   -mcpu=neoverse-v1 -sve-tail-folding=disabled < %s | FileCheck %s --check-prefix=CHECK-EPILOG
 ; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
-; RUN:   -mcpu=neoverse-v2 < %s | FileCheck %s --check-prefix=CHECK-NO-EPILOG
+; RUN:   -mcpu=neoverse-v2 < %s | FileCheck %s --check-prefix=CHECK-EPILOG
 ; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
 ; RUN:   -mcpu=cortex-x2 < %s | FileCheck %s --check-prefix=CHECK-NO-EPILOG
 

Original file line number	Diff line number	Diff line change
`@@ -192,6 +192,8 @@ class TargetTransformInfoImplBase {`
`192`	`192`	`return false;`
`193`	`193`	`}`
`194`	`194`
	`195`	`+ unsigned getEpilogueVectorizationMinVF() const { return 16; }`
	`196`	`+`
`195`	`197`	`bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const { return false; }`
`196`	`198`
`197`	`199`	`TailFoldingStyle`
Original file line number	Diff line number	Diff line change
`@@ -667,6 +667,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {`
`667`	`667`	`return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);`
`668`	`668`	`}`
`669`	`669`
	`670`	`+ unsigned getEpilogueVectorizationMinVF() {`
	`671`	`+ return BaseT::getEpilogueVectorizationMinVF();`
	`672`	`+ }`
	`673`	`+`
`670`	`674`	`bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) {`
`671`	`675`	`return BaseT::preferPredicateOverEpilogue(TFI);`
`672`	`676`	`}`
Original file line number	Diff line number	Diff line change
`@@ -376,6 +376,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {`
`376`	`376`	`return ST->useFixedOverScalableIfEqualCost();`
`377`	`377`	`}`
`378`	`378`
	`379`	`+ unsigned getEpilogueVectorizationMinVF() const;`
	`380`	`+`
`379`	`381`	`bool preferPredicateOverEpilogue(TailFoldingInfo *TFI);`
`380`	`382`
`381`	`383`	`bool supportsScalableVectors() const {`