[SLP] Correctly decide whether a non-power-of-2 number of stores can be vectorized.

alexey-bataev · alexey-bataev · commit aeedab77b596 · 2024-08-29T12:40:31.000-07:00
The maximum type size in the graph must be considered before attempting
to vectorize a non-power-of-2 number of elements, which may be less than
MinVF.
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -16476,8 +16476,9 @@ bool SLPVectorizerPass::vectorizeStores(
         // First try vectorizing with a non-power-of-2 VF. At the moment, only
         // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
         // lanes are used.
-        unsigned CandVF = Operands.size();
-        if (has_single_bit(CandVF + 1) && CandVF <= MaxRegVF)
+        unsigned CandVF =
+            std::clamp<unsigned>(Operands.size(), MaxVF, MaxRegVF);
+        if (has_single_bit(CandVF + 1))
           NonPowerOf2VF = CandVF;
       }
 
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll b/llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll
@@ -9,43 +9,31 @@
 ;}
 
 define i32 @foo(ptr noalias nocapture %A, ptr noalias nocapture %B, float %T) {
-; NON-POW2-LABEL: @foo(
-; NON-POW2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 10
-; NON-POW2-NEXT:    [[TMP2:%.*]] = load <3 x float>, ptr [[TMP1]], align 4
-; NON-POW2-NEXT:    [[TMP3:%.*]] = insertelement <3 x float> poison, float [[T:%.*]], i32 0
-; NON-POW2-NEXT:    [[TMP4:%.*]] = shufflevector <3 x float> [[TMP3]], <3 x float> poison, <3 x i32> zeroinitializer
-; NON-POW2-NEXT:    [[TMP5:%.*]] = fmul <3 x float> [[TMP2]], [[TMP4]]
-; NON-POW2-NEXT:    [[TMP6:%.*]] = fpext <3 x float> [[TMP5]] to <3 x double>
-; NON-POW2-NEXT:    [[TMP7:%.*]] = fadd <3 x double> [[TMP6]], <double 4.000000e+00, double 5.000000e+00, double 6.000000e+00>
-; NON-POW2-NEXT:    [[TMP8:%.*]] = fptosi <3 x double> [[TMP7]] to <3 x i8>
-; NON-POW2-NEXT:    store <3 x i8> [[TMP8]], ptr [[A:%.*]], align 1
-; NON-POW2-NEXT:    ret i32 undef
-;
-; POW2-ONLY-LABEL: @foo(
-; POW2-ONLY-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 10
-; POW2-ONLY-NEXT:    [[TMP2:%.*]] = load float, ptr [[TMP1]], align 4
-; POW2-ONLY-NEXT:    [[TMP3:%.*]] = fmul float [[TMP2]], [[T:%.*]]
-; POW2-ONLY-NEXT:    [[TMP4:%.*]] = fpext float [[TMP3]] to double
-; POW2-ONLY-NEXT:    [[TMP5:%.*]] = fadd double [[TMP4]], 4.000000e+00
-; POW2-ONLY-NEXT:    [[TMP6:%.*]] = fptosi double [[TMP5]] to i8
-; POW2-ONLY-NEXT:    store i8 [[TMP6]], ptr [[A:%.*]], align 1
-; POW2-ONLY-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[B]], i64 11
-; POW2-ONLY-NEXT:    [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4
-; POW2-ONLY-NEXT:    [[TMP9:%.*]] = fmul float [[TMP8]], [[T]]
-; POW2-ONLY-NEXT:    [[TMP10:%.*]] = fpext float [[TMP9]] to double
-; POW2-ONLY-NEXT:    [[TMP11:%.*]] = fadd double [[TMP10]], 5.000000e+00
-; POW2-ONLY-NEXT:    [[TMP12:%.*]] = fptosi double [[TMP11]] to i8
-; POW2-ONLY-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 1
-; POW2-ONLY-NEXT:    store i8 [[TMP12]], ptr [[TMP13]], align 1
-; POW2-ONLY-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B]], i64 12
-; POW2-ONLY-NEXT:    [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4
-; POW2-ONLY-NEXT:    [[TMP16:%.*]] = fmul float [[TMP15]], [[T]]
-; POW2-ONLY-NEXT:    [[TMP17:%.*]] = fpext float [[TMP16]] to double
-; POW2-ONLY-NEXT:    [[TMP18:%.*]] = fadd double [[TMP17]], 6.000000e+00
-; POW2-ONLY-NEXT:    [[TMP19:%.*]] = fptosi double [[TMP18]] to i8
-; POW2-ONLY-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 2
-; POW2-ONLY-NEXT:    store i8 [[TMP19]], ptr [[TMP20]], align 1
-; POW2-ONLY-NEXT:    ret i32 undef
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 10
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul float [[TMP2]], [[T:%.*]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fpext float [[TMP3]] to double
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd double [[TMP4]], 4.000000e+00
+; CHECK-NEXT:    [[TMP6:%.*]] = fptosi double [[TMP5]] to i8
+; CHECK-NEXT:    store i8 [[TMP6]], ptr [[A:%.*]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[B]], i64 11
+; CHECK-NEXT:    [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = fmul float [[TMP8]], [[T]]
+; CHECK-NEXT:    [[TMP10:%.*]] = fpext float [[TMP9]] to double
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd double [[TMP10]], 5.000000e+00
+; CHECK-NEXT:    [[TMP12:%.*]] = fptosi double [[TMP11]] to i8
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 1
+; CHECK-NEXT:    store i8 [[TMP12]], ptr [[TMP13]], align 1
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B]], i64 12
+; CHECK-NEXT:    [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = fmul float [[TMP15]], [[T]]
+; CHECK-NEXT:    [[TMP17:%.*]] = fpext float [[TMP16]] to double
+; CHECK-NEXT:    [[TMP18:%.*]] = fadd double [[TMP17]], 6.000000e+00
+; CHECK-NEXT:    [[TMP19:%.*]] = fptosi double [[TMP18]] to i8
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 2
+; CHECK-NEXT:    store i8 [[TMP19]], ptr [[TMP20]], align 1
+; CHECK-NEXT:    ret i32 undef
 ;
   %1 = getelementptr inbounds float, ptr %B, i64 10
   %2 = load float, ptr %1, align 4