llvm · HanKuanChen · Apr 2, 2025 · Apr 2, 2025 · Apr 2, 2025 · gbossu
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -11298,8 +11298,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
     if (!E2 && InVectors.size() == 1) {
       unsigned VF = E1.getVectorFactor();
       if (Value *V1 = dyn_cast<Value *>(InVectors.front())) {
-        VF = std::max(VF,
-                      cast<FixedVectorType>(V1->getType())->getNumElements());
+        VF = std::max(VF, getVF(V1));
       } else {
         const auto *E = cast<const TreeEntry *>(InVectors.front());
         VF = std::max(VF, E->getVectorFactor());

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/revec-estimateNodesPermuteCost.ll b/llvm/test/Transforms/SLPVectorizer/X86/revec-estimateNodesPermuteCost.ll
@@ -0,0 +1,71 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 -passes=slp-vectorizer -S -slp-revec < %s | FileCheck %s
+
+define i32 @test1(<4 x float> %0, <4 x float> %1) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr null, i64 288
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr null, i64 304
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr null, i64 416
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr null, i64 432
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr null, i64 256
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr null, i64 272
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr null, i64 288
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr null, i64 304
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x float>, ptr [[TMP2]], align 16
+; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x float>, ptr [[TMP3]], align 16
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x float>, ptr [[TMP4]], align 16
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x float>, ptr [[TMP5]], align 16
+; CHECK-NEXT:    [[TMP14:%.*]] = fmul <4 x float> [[TMP10]], [[TMP0:%.*]]
+; CHECK-NEXT:    [[TMP15:%.*]] = fmul <4 x float> [[TMP11]], [[TMP0]]
+; CHECK-NEXT:    [[TMP16:%.*]] = fmul <4 x float> [[TMP12]], [[TMP0]]
+; CHECK-NEXT:    [[TMP17:%.*]] = fmul <4 x float> [[TMP13]], [[TMP0]]
+; CHECK-NEXT:    [[TMP18:%.*]] = fsub <4 x float> [[TMP14]], [[TMP1:%.*]]
+; CHECK-NEXT:    [[TMP19:%.*]] = fsub <4 x float> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = fsub <4 x float> [[TMP16]], [[TMP1]]
+; CHECK-NEXT:    [[TMP21:%.*]] = fsub <4 x float> [[TMP17]], zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = fmul <4 x float> [[TMP11]], zeroinitializer
+; CHECK-NEXT:    [[TMP23:%.*]] = fmul <4 x float> [[TMP13]], zeroinitializer
+; CHECK-NEXT:    [[TMP24:%.*]] = fadd <4 x float> [[TMP18]], [[TMP0]]
+; CHECK-NEXT:    [[TMP25:%.*]] = fadd <4 x float> [[TMP19]], zeroinitializer
+; CHECK-NEXT:    [[TMP26:%.*]] = fadd <4 x float> [[TMP20]], [[TMP0]]
+; CHECK-NEXT:    [[TMP27:%.*]] = fadd <4 x float> [[TMP21]], zeroinitializer
+; CHECK-NEXT:    store <4 x float> [[TMP24]], ptr [[TMP6]], align 16
+; CHECK-NEXT:    store <4 x float> [[TMP25]], ptr [[TMP7]], align 16
+; CHECK-NEXT:    store <4 x float> [[TMP26]], ptr [[TMP8]], align 16
+; CHECK-NEXT:    store <4 x float> [[TMP27]], ptr [[TMP9]], align 16
+; CHECK-NEXT:    ret i32 0
+;
+entry: ; The expected output above mirrors this body instruction for instruction, i.e. the pass must leave it unchanged.
+  %2 = getelementptr i8, ptr null, i64 288 ; Load addresses %2..%5: byte offsets 288, 304, 416, 432 from a null base.
+  %3 = getelementptr i8, ptr null, i64 304
+  %4 = getelementptr i8, ptr null, i64 416
+  %5 = getelementptr i8, ptr null, i64 432
+  %6 = getelementptr i8, ptr null, i64 256 ; Store addresses %6..%9: byte offsets 256, 272, 288, 304 (16-byte steps).
+  %7 = getelementptr i8, ptr null, i64 272
+  %8 = getelementptr i8, ptr null, i64 288 ; Same offset as load address %2.
+  %9 = getelementptr i8, ptr null, i64 304 ; Same offset as load address %3.
+  %10 = load <4 x float>, ptr %2, align 16 ; Four <4 x float> loads; with -slp-revec these vectors are the elements SLP tries to bundle.
+  %11 = load <4 x float>, ptr %3, align 16
+  %12 = load <4 x float>, ptr %4, align 16
+  %13 = load <4 x float>, ptr %5, align 16
+  %14 = fmul <4 x float> %10, %0 ; Every loaded vector is multiplied by argument %0.
+  %15 = fmul <4 x float> %11, %0
+  %16 = fmul <4 x float> %12, %0
+  %17 = fmul <4 x float> %13, %0
+  %18 = fsub <4 x float> %14, %1 ; The fsub second operand alternates: %1, zeroinitializer, %1, zeroinitializer.
+  %19 = fsub <4 x float> %15, zeroinitializer
+  %20 = fsub <4 x float> %16, %1
+  %21 = fsub <4 x float> %17, zeroinitializer
+  %22 = fmul <4 x float> %11, zeroinitializer ; %22 and %23 have no uses in this function; they are still expected in the output.
+  %23 = fmul <4 x float> %13, zeroinitializer
+  %24 = fadd <4 x float> %18, %0 ; The fadd second operand alternates the same way: %0, zeroinitializer, %0, zeroinitializer.
+  %25 = fadd <4 x float> %19, zeroinitializer
+  %26 = fadd <4 x float> %20, %0
+  %27 = fadd <4 x float> %21, zeroinitializer
+  store <4 x float> %24, ptr %6, align 16 ; Results are stored to the four addresses %6..%9 in order.
+  store <4 x float> %25, ptr %7, align 16
+  store <4 x float> %26, ptr %8, align 16
+  store <4 x float> %27, ptr %9, align 16
+  ret i32 0 ; NOTE(review): presumably reduced from a crash reproducer for the getVF change in this patch - confirm against the PR.
+}