Revert "[AArch64][SVE] Enable max vector bandwidth for SVE" #112873

Merged
merged 1 commit into main from revert-109671-sve-max-vector-bandwidth on Oct 18, 2024

Conversation

huntergr-arm (Collaborator)

Reverts #109671

Reverting due to some performance regressions on neoverse-v1.
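For context, the functional change being reverted is confined to AArch64TTIImpl::shouldMaximizeVectorBandwidth (the first hunk of the diff below); the rest of the patch is test-expectation churn. Below is a minimal standalone sketch of that decision, using RegisterKind and Subtarget as simplified stand-ins for the real LLVM types; it is an illustration of the logic, not the actual implementation:

    // Sketch only: models AArch64TTIImpl::shouldMaximizeVectorBandwidth with
    // stand-in types; the real code lives in
    // llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp.
    #include <cassert>
    #include <cstdio>

    enum class RegisterKind { Scalar, FixedWidthVector, ScalableVector };

    struct Subtarget {
      bool NeonAvailable;
      bool SVEOrStreamingSVEAvailable;
    };

    // After this revert: maximize vector bandwidth only for fixed-width
    // (NEON) vectors.
    bool shouldMaximizeVectorBandwidth(RegisterKind K, const Subtarget &ST) {
      assert(K != RegisterKind::Scalar);
      // Before the revert (the removed '-' lines in the diff below), this
      // also returned true for ScalableVector when SVE or streaming SVE
      // was available.
      return K == RegisterKind::FixedWidthVector && ST.NeonAvailable;
    }

    int main() {
      Subtarget ST{/*NeonAvailable=*/true, /*SVEOrStreamingSVEAvailable=*/true};
      // Prints "fixed=1 scalable=0"; with the reverted patch applied, the
      // scalable case was also 1, which drove the wider VFs in the tests.
      std::printf("fixed=%d scalable=%d\n",
                  shouldMaximizeVectorBandwidth(RegisterKind::FixedWidthVector, ST) ? 1 : 0,
                  shouldMaximizeVectorBandwidth(RegisterKind::ScalableVector, ST) ? 1 : 0);
    }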

@llvmbot (Member) commented Oct 18, 2024

@llvm/pr-subscribers-backend-aarch64

Author: Graham Hunter (huntergr-arm)

Changes

Reverts llvm/llvm-project#109671

Reverting due to some performance regressions on neoverse-v1.


Patch is 54.00 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/112873.diff

8 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (+2-4)
  • (modified) llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll (+25-55)
  • (modified) llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll (+6-6)
  • (modified) llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll (+2-2)
  • (modified) llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll (+48-71)
  • (modified) llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll (+28-60)
  • (modified) llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll (+14-14)
  • (modified) llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll (+1-1)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 7c6b789b9c1b72..ff3c69f7e10c66 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -337,10 +337,8 @@ AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
 bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
     TargetTransformInfo::RegisterKind K) const {
   assert(K != TargetTransformInfo::RGK_Scalar);
-  return ((K == TargetTransformInfo::RGK_FixedWidthVector &&
-           ST->isNeonAvailable()) ||
-          (K == TargetTransformInfo::RGK_ScalableVector &&
-           ST->isSVEorStreamingSVEAvailable()));
+  return (K == TargetTransformInfo::RGK_FixedWidthVector &&
+          ST->isNeonAvailable());
 }
 
 /// Calculate the cost of materializing a 64-bit value. This helper
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index 01fca39296da09..7f325ce1a1f04b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -732,20 +732,9 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
 ; DEFAULT-LABEL: define void @multiple_exit_conditions(
 ; DEFAULT-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR2:[0-9]+]] {
 ; DEFAULT-NEXT:  entry:
-; DEFAULT-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 32
-; DEFAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 257, [[TMP8]]
-; DEFAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DEFAULT-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; DEFAULT:       vector.ph:
-; DEFAULT-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 32
-; DEFAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 257, [[TMP3]]
-; DEFAULT-NEXT:    [[N_VEC:%.*]] = sub i64 257, [[N_MOD_VF]]
-; DEFAULT-NEXT:    [[TMP17:%.*]] = mul i64 [[N_VEC]], 8
-; DEFAULT-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP17]]
-; DEFAULT-NEXT:    [[IND_END1:%.*]] = mul i64 [[N_VEC]], 2
-; DEFAULT-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 32
+; DEFAULT-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 2048
 ; DEFAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; DEFAULT:       vector.body:
 ; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -753,39 +742,20 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
 ; DEFAULT-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; DEFAULT-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
 ; DEFAULT-NEXT:    [[TMP1:%.*]] = load i16, ptr [[SRC]], align 2
-; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[TMP1]], i64 0
-; DEFAULT-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
-; DEFAULT-NEXT:    [[TMP9:%.*]] = or <vscale x 8 x i16> [[BROADCAST_SPLAT]], shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
-; DEFAULT-NEXT:    [[TMP10:%.*]] = or <vscale x 8 x i16> [[BROADCAST_SPLAT]], shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
-; DEFAULT-NEXT:    [[TMP11:%.*]] = or <vscale x 8 x i16> [[BROADCAST_SPLAT]], shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
-; DEFAULT-NEXT:    [[TMP12:%.*]] = or <vscale x 8 x i16> [[BROADCAST_SPLAT]], shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
-; DEFAULT-NEXT:    [[TMP13:%.*]] = uitofp <vscale x 8 x i16> [[TMP9]] to <vscale x 8 x double>
-; DEFAULT-NEXT:    [[TMP14:%.*]] = uitofp <vscale x 8 x i16> [[TMP10]] to <vscale x 8 x double>
-; DEFAULT-NEXT:    [[TMP15:%.*]] = uitofp <vscale x 8 x i16> [[TMP11]] to <vscale x 8 x double>
-; DEFAULT-NEXT:    [[TMP16:%.*]] = uitofp <vscale x 8 x i16> [[TMP12]] to <vscale x 8 x double>
+; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[TMP1]], i64 0
+; DEFAULT-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
+; DEFAULT-NEXT:    [[TMP2:%.*]] = or <8 x i16> [[BROADCAST_SPLAT]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+; DEFAULT-NEXT:    [[TMP3:%.*]] = uitofp <8 x i16> [[TMP2]] to <8 x double>
 ; DEFAULT-NEXT:    [[TMP4:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i32 0
-; DEFAULT-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 8
-; DEFAULT-NEXT:    [[TMP20:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP19]]
-; DEFAULT-NEXT:    [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP21]], 16
-; DEFAULT-NEXT:    [[TMP23:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP22]]
-; DEFAULT-NEXT:    [[TMP24:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT:    [[TMP25:%.*]] = mul i64 [[TMP24]], 24
-; DEFAULT-NEXT:    [[TMP26:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP25]]
-; DEFAULT-NEXT:    store <vscale x 8 x double> [[TMP13]], ptr [[TMP4]], align 8
-; DEFAULT-NEXT:    store <vscale x 8 x double> [[TMP14]], ptr [[TMP20]], align 8
-; DEFAULT-NEXT:    store <vscale x 8 x double> [[TMP15]], ptr [[TMP23]], align 8
-; DEFAULT-NEXT:    store <vscale x 8 x double> [[TMP16]], ptr [[TMP26]], align 8
-; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
-; DEFAULT-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DEFAULT-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; DEFAULT-NEXT:    store <8 x double> [[TMP3]], ptr [[TMP4]], align 8
+; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; DEFAULT-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; DEFAULT-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; DEFAULT:       middle.block:
-; DEFAULT-NEXT:    [[CMP_N:%.*]] = icmp eq i64 257, [[N_VEC]]
-; DEFAULT-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; DEFAULT-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; DEFAULT:       scalar.ph:
 ; DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[DST]], [[ENTRY:%.*]] ]
-; DEFAULT-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; DEFAULT-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 512, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; DEFAULT-NEXT:    br label [[LOOP:%.*]]
 ; DEFAULT:       vector.scevcheck:
 ; DEFAULT-NEXT:    unreachable
@@ -810,7 +780,7 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
 ; PRED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; PRED:       vector.ph:
 ; PRED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; PRED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
 ; PRED-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 257, [[TMP2]]
 ; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
@@ -819,31 +789,31 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
 ; PRED-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]]
 ; PRED-NEXT:    [[IND_END1:%.*]] = mul i64 [[N_VEC]], 2
 ; PRED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; PRED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; PRED-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 8
+; PRED-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 2
 ; PRED-NEXT:    [[TMP8:%.*]] = sub i64 257, [[TMP7]]
 ; PRED-NEXT:    [[TMP9:%.*]] = icmp ugt i64 257, [[TMP7]]
 ; PRED-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], i64 [[TMP8]], i64 0
-; PRED-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 257)
+; PRED-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 257)
 ; PRED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; PRED:       vector.body:
 ; PRED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PRED-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; PRED-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
 ; PRED-NEXT:    [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; PRED-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]]
 ; PRED-NEXT:    [[TMP12:%.*]] = load i16, ptr [[SRC]], align 2
-; PRED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[TMP12]], i64 0
-; PRED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
-; PRED-NEXT:    [[TMP13:%.*]] = or <vscale x 8 x i16> [[BROADCAST_SPLAT]], shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
-; PRED-NEXT:    [[TMP14:%.*]] = uitofp <vscale x 8 x i16> [[TMP13]] to <vscale x 8 x double>
+; PRED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i16> poison, i16 [[TMP12]], i64 0
+; PRED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+; PRED-NEXT:    [[TMP13:%.*]] = or <vscale x 2 x i16> [[BROADCAST_SPLAT]], shufflevector (<vscale x 2 x i16> insertelement (<vscale x 2 x i16> poison, i16 1, i64 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer)
+; PRED-NEXT:    [[TMP14:%.*]] = uitofp <vscale x 2 x i16> [[TMP13]] to <vscale x 2 x double>
 ; PRED-NEXT:    [[TMP15:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i32 0
-; PRED-NEXT:    call void @llvm.masked.store.nxv8f64.p0(<vscale x 8 x double> [[TMP14]], ptr [[TMP15]], i32 8, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
+; PRED-NEXT:    call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP14]], ptr [[TMP15]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
 ; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP5]]
-; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP10]])
-; PRED-NEXT:    [[TMP16:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
-; PRED-NEXT:    [[TMP17:%.*]] = extractelement <vscale x 8 x i1> [[TMP16]], i32 0
+; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP10]])
+; PRED-NEXT:    [[TMP16:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
+; PRED-NEXT:    [[TMP17:%.*]] = extractelement <vscale x 2 x i1> [[TMP16]], i32 0
 ; PRED-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
 ; PRED:       middle.block:
 ; PRED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
index 59da1e10fd2a07..f28f77bf1b1558 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
@@ -1,23 +1,23 @@
 ; REQUIRES: asserts
 ; RUN: opt -mtriple=aarch64 -mattr=+sve \
 ; RUN:     -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE16
+; RUN:     | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4
 
 ; RUN: opt -mtriple=aarch64 -mattr=+sve -mcpu=generic \
 ; RUN:     -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE16
+; RUN:     | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4
 
 ; RUN: opt -mtriple=aarch64 -mcpu=neoverse-v1 \
 ; RUN:     -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=NEOVERSE-V1,VF-VSCALE16
+; RUN:     | FileCheck %s --check-prefixes=NEOVERSE-V1,VF-VSCALE4
 
 ; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 \
 ; RUN:     -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE16
+; RUN:     | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE4
 
 ; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 \
 ; RUN:     -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE16
+; RUN:     | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE4
 
 ; GENERIC: LV: Vector loop of width vscale x 2 costs: 3 (assuming a minimum vscale of 2).
 ; GENERIC: LV: Vector loop of width vscale x 4 costs: 1 (assuming a minimum vscale of 2).
@@ -29,7 +29,7 @@
 ; NEOVERSE-N2: LV: Vector loop of width vscale x 4 costs: 3 (assuming a minimum vscale of 1).
 
 ; VF-4: <4 x i32>
-; VF-VSCALE16: <vscale x 16 x i32>
+; VF-VSCALE4: <16 x i32>
 define void @test0(ptr %a, ptr %b, ptr %c) #0 {
 entry:
   br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
index a84932a2290d67..e83eb729b521c2 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
@@ -8,8 +8,8 @@
 ; (maximized bandwidth for i8 in the loop).
 define void @test0(ptr %a, ptr %b, ptr %c) #0 {
 ; CHECK: LV: Checking a loop in 'test0'
-; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 16
-; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 16
+; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4
+; CHECK_SCALABLE_ON: LV: Selecting VF: 16
 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
 ; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16
 ; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 16
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll
index a4861ad0b26196..7d2fc348480a09 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll
@@ -145,7 +145,7 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 {
 ; DEFAULT-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i16 [[X:%.*]]) #[[ATTR1:[0-9]+]] {
 ; DEFAULT-NEXT:  iter.check:
 ; DEFAULT-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; DEFAULT-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
 ; DEFAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
 ; DEFAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; DEFAULT:       vector.memcheck:
@@ -155,72 +155,59 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 {
 ; DEFAULT-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; DEFAULT-NEXT:    br i1 [[FOUND_CONFLICT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; DEFAULT:       vector.main.loop.iter.check:
-; DEFAULT-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP9]], 32
-; DEFAULT-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 0, [[TMP3]]
-; DEFAULT-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DEFAULT-NEXT:    br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; DEFAULT:       vector.ph:
-; DEFAULT-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 32
-; DEFAULT-NEXT:    [[N_MOD_VF1:%.*]] = urem i64 0, [[TMP5]]
-; DEFAULT-NEXT:    [[N_VEC1:%.*]] = sub i64 0, [[N_MOD_VF1]]
-; DEFAULT-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 32
-; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i16> poison, i16 [[X]], i64 0
-; DEFAULT-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
-; DEFAULT-NEXT:    [[TMP8:%.*]] = trunc <vscale x 16 x i16> [[BROADCAST_SPLAT]] to <vscale x 16 x i8>
+; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <16 x i16> poison, i16 [[X]], i64 0
+; DEFAULT-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT3]], <16 x i16> poison, <16 x i32> zeroinitializer
+; DEFAULT-NEXT:    [[TMP7:%.*]] = trunc <16 x i16> [[BROADCAST_SPLAT4]] to <16 x i8>
 ; DEFAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; DEFAULT:       vector.body:
 ; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; DEFAULT-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
-; DEFAULT-NEXT:    [[TMP14:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META5:![0-9]+]]
-; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[TMP14]], i64 0
-; DEFAULT-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT2]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
-; DEFAULT-NEXT:    [[TMP11:%.*]] = trunc <vscale x 16 x i64> [[BROADCAST_SPLAT3]] to <vscale x 16 x i8>
-; DEFAULT-NEXT:    [[TMP22:%.*]] = and <vscale x 16 x i8> [[TMP11]], [[TMP8]]
-; DEFAULT-NEXT:    [[TMP13:%.*]] = and <vscale x 16 x i8> [[TMP11]], [[TMP8]]
+; DEFAULT-NEXT:    [[TMP4:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META5:![0-9]+]]
+; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[TMP4]], i64 0
+; DEFAULT-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer
+; DEFAULT-NEXT:    [[TMP5:%.*]] = trunc <16 x i64> [[BROADCAST_SPLAT2]] to <16 x i8>
+; DEFAULT-NEXT:    [[TMP8:%.*]] = and <16 x i8> [[TMP5]], [[TMP7]]
+; DEFAULT-NEXT:    [[TMP9:%.*]] = and <16 x i8> [[TMP5]], [[TMP7]]
 ; DEFAULT-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP2]]
 ; DEFAULT-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
-; DEFAULT-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP16]], 16
-; DEFAULT-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP23]]
-; DEFAULT-NEXT:    store <vscale x 16 x i8> [[TMP22]], ptr [[TMP12]], align 1, !alias.scope [[META8:![0-9]+]], !noalias [[META5]]
-; DEFAULT-NEXT:    store <vscale x 16 x i8> [...
[truncated]
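The width change is also visible in the vectorizer's debug output. A hedged way to reproduce locally, mirroring the first RUN line of scalable-vectorization-cost-tuning.ll shown above (requires an assertions-enabled build of opt, per the test's REQUIRES: asserts directive; the grep filter is an illustrative substitute for the test's FileCheck pipe):

    opt -mtriple=aarch64 -mattr=+sve \
        -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize \
        < llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll 2>&1 \
        | grep 'LV: Vector loop of width'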

@llvmbot (Member) commented Oct 18, 2024

@llvm/pr-subscribers-llvm-transforms

huntergr-arm merged commit 091a235 into main on Oct 18, 2024 (8 of 9 checks passed)
huntergr-arm deleted the revert-109671-sve-max-vector-bandwidth branch on October 18, 2024 at 10:05