[SLP] Adjust NumberOfParts value for adjusted number of buildvector scalars

alexey-bataev · alexey-bataev · commit 1d5fbe83c3bc · 2025-01-28T05:45:13.000-08:00
NumParts needs to be recomputed when the GatheredScalars list is adjusted
(resized) after the extractelements analysis; this fixes a compiler crash.
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -14898,6 +14898,12 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
             Resized = true;
             GatheredScalars.append(VF - GatheredScalars.size(),
                                    PoisonValue::get(OrigScalarTy));
+            NumParts = TTI->getNumberOfParts(getWidenedType(OrigScalarTy, VF));
+            if (NumParts == 0 || NumParts >= GatheredScalars.size() ||
+                VecTy->getNumElements() % NumParts != 0 ||
+                !hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(),
+                                          VecTy->getNumElements() / NumParts))
+              NumParts = 1;
           }
       }
     }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/resized-bv-values-non-power-of2-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/resized-bv-values-non-power-of2-node.ll
@@ -0,0 +1,269 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux < %s | FileCheck %s
+
+define <16 x half> @test(i32 %0, float %1, i32 %2) {
+; CHECK-LABEL: define <16 x half> @test(
+; CHECK-SAME: i32 [[TMP0:%.*]], float [[TMP1:%.*]], i32 [[TMP2:%.*]]) {
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <16 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00>, float [[TMP1]], i32 13
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x float> [[TMP4]] to <16 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x float> zeroinitializer)
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[TMP2]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[TMP0]], [[TMP0]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = fcmp ogt float [[TMP9]], 0.000000e+00
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <16 x i32> [[TMP5]], i32 10
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast float 0.000000e+00 to i32
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 0
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp ult i32 0, 0
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ult i32 0, 0
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp ult i32 0, 0
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i32 0, 0
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <16 x i32> [[TMP5]], i32 4
+; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[TMP22]], 0
+; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i32 [[TMP0]], [[TMP0]]
+; CHECK-NEXT:    [[TMP25:%.*]] = bitcast float 0.000000e+00 to i32
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i32 [[TMP25]], 0
+; CHECK-NEXT:    [[TMP27:%.*]] = icmp ult <16 x i32> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP28:%.*]] = select <16 x i1> [[TMP27]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP29:%.*]] = sitofp <16 x i32> [[TMP28]] to <16 x float>
+; CHECK-NEXT:    [[TMP30:%.*]] = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> [[TMP29]], <16 x float> zeroinitializer, <16 x float> zeroinitializer)
+; CHECK-NEXT:    [[TMP31:%.*]] = fadd <16 x float> [[TMP30]], zeroinitializer
+; CHECK-NEXT:    [[TMP32:%.*]] = call <12 x i1> @llvm.vector.insert.v12i1.v2i1(<12 x i1> poison, <2 x i1> zeroinitializer, i64 0)
+; CHECK-NEXT:    [[TMP33:%.*]] = shufflevector <12 x i1> [[TMP32]], <12 x i1> <i1 poison, i1 false, i1 false, i1 poison, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <12 x i32> <i32 0, i32 13, i32 14, i32 1, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    [[TMP34:%.*]] = shufflevector <12 x i1> [[TMP33]], <12 x i1> poison, <16 x i32> <i32 0, i32 1, i32 0, i32 0, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 0, i32 10, i32 11, i32 0>
+; CHECK-NEXT:    [[TMP35:%.*]] = select <16 x i1> [[TMP34]], <16 x float> zeroinitializer, <16 x float> [[TMP31]]
+; CHECK-NEXT:    [[TMP36:%.*]] = bitcast <16 x float> [[TMP35]] to <16 x i32>
+; CHECK-NEXT:    [[TMP37:%.*]] = and <16 x i32> [[TMP36]], zeroinitializer
+; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <16 x i32> [[TMP37]] to <16 x float>
+; CHECK-NEXT:    [[TMP39:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v2f32(<16 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float poison>, <2 x float> [[TMP6]], i64 14)
+; CHECK-NEXT:    [[TMP40:%.*]] = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> zeroinitializer, <16 x float> [[TMP38]], <16 x float> [[TMP39]])
+; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <16 x float> [[TMP29]], i32 0
+; CHECK-NEXT:    [[TMP42:%.*]] = fcmp olt float [[TMP41]], 0.000000e+00
+; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <16 x float> [[TMP29]], i32 14
+; CHECK-NEXT:    [[TMP44:%.*]] = fcmp ogt float [[TMP43]], 0.000000e+00
+; CHECK-NEXT:    [[TMP45:%.*]] = fcmp olt float [[TMP43]], 0.000000e+00
+; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <16 x float> [[TMP29]], i32 13
+; CHECK-NEXT:    [[TMP47:%.*]] = fcmp ogt float [[TMP46]], 0.000000e+00
+; CHECK-NEXT:    [[TMP48:%.*]] = fcmp olt float [[TMP46]], 0.000000e+00
+; CHECK-NEXT:    [[TMP49:%.*]] = fcmp olt float [[TMP41]], 0.000000e+00
+; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <16 x float> [[TMP29]], i32 1
+; CHECK-NEXT:    [[TMP51:%.*]] = fcmp ogt float [[TMP50]], 0.000000e+00
+; CHECK-NEXT:    [[TMP52:%.*]] = fcmp oeq <16 x float> [[TMP40]], zeroinitializer
+; CHECK-NEXT:    ret <16 x half> zeroinitializer
+;
+  %4 = bitcast float 0.000000e+00 to i32
+  %5 = fcmp olt float 0.000000e+00, 0.000000e+00
+  %6 = icmp ult i32 %4, 0
+  %7 = select i1 %6, i32 0, i32 0
+  %8 = sitofp i32 %7 to float
+  %9 = tail call float @llvm.fmuladd.f32(float %8, float 0.000000e+00, float 0.000000e+00)
+  %10 = fadd float %9, 0.000000e+00
+  %11 = select i1 %5, float 0.000000e+00, float %10
+  %12 = bitcast float %11 to i32
+  %13 = and i32 %12, 0
+  %14 = bitcast i32 %13 to float
+  %15 = tail call float @llvm.fmuladd.f32(float 0.000000e+00, float 0.000000e+00, float 0.000000e+00)
+  %16 = tail call float @llvm.fmuladd.f32(float 0.000000e+00, float %14, float %15)
+  %17 = fcmp oeq float %16, 0.000000e+00
+  %18 = fcmp olt float %8, 0.000000e+00
+  %19 = icmp ugt i32 %2, 0
+  %20 = bitcast float 0.000000e+00 to i32
+  %21 = icmp eq i32 %0, %0
+  %22 = icmp ult i32 %20, 0
+  %23 = select i1 %22, i32 0, i32 0
+  %24 = sitofp i32 %23 to float
+  %25 = tail call float @llvm.fmuladd.f32(float %24, float 0.000000e+00, float 0.000000e+00)
+  %26 = fadd float %25, 0.000000e+00
+  %27 = select i1 false, float 0.000000e+00, float %26
+  %28 = bitcast float %27 to i32
+  %29 = and i32 %28, 0
+  %30 = bitcast i32 %29 to float
+  %31 = tail call float @llvm.fmuladd.f32(float 0.000000e+00, float 0.000000e+00, float 0.000000e+00)
+  %32 = tail call float @llvm.fmuladd.f32(float 0.000000e+00, float %30, float %31)
+  %33 = fcmp ogt float %24, 0.000000e+00
+  %34 = fcmp oeq float %32, 0.000000e+00
+  %35 = fcmp ogt float %31, 0.000000e+00
+  %36 = fcmp olt float %24, 0.000000e+00
+  %37 = bitcast float %1 to i32
+  %38 = icmp ult i32 %37, 0
+  %39 = select i1 %38, i32 0, i32 0
+  %40 = sitofp i32 %39 to float
+  %41 = tail call float @llvm.fmuladd.f32(float %40, float 0.000000e+00, float 0.000000e+00)
+  %42 = fadd float %41, 0.000000e+00
+  %43 = select i1 false, float 0.000000e+00, float %42
+  %44 = bitcast float %43 to i32
+  %45 = and i32 %44, 0
+  %46 = bitcast i32 %45 to float
+  %47 = tail call float @llvm.fmuladd.f32(float 0.000000e+00, float %46, float 0.000000e+00)
+  %48 = fadd float 0.000000e+00, 0.000000e+00
+  %49 = fcmp ogt float %40, 0.000000e+00
+  %50 = fcmp oeq float %47, 0.000000e+00
+  %51 = fcmp ogt float %48, 0.000000e+00
+  %52 = fcmp olt float %40, 0.000000e+00
+  %53 = icmp eq i32 0, 0
+  %54 = bitcast float 0.000000e+00 to i32
+  %55 = icmp eq i32 0, 0
+  %56 = icmp ult i32 %54, 0
+  %57 = select i1 %56, i32 0, i32 0
+  %58 = sitofp i32 %57 to float
+  %59 = tail call float @llvm.fmuladd.f32(float %58, float 0.000000e+00, float 0.000000e+00)
+  %60 = fadd float %59, 0.000000e+00
+  %61 = select i1 %5, float 0.000000e+00, float %60
+  %62 = bitcast float %61 to i32
+  %63 = and i32 %62, 0
+  %64 = bitcast i32 %63 to float
+  %65 = tail call float @llvm.fmuladd.f32(float 0.000000e+00, float %64, float 0.000000e+00)
+  %66 = fcmp oeq float %65, 0.000000e+00
+  %67 = bitcast float 0.000000e+00 to i32
+  %68 = icmp eq i32 %0, 0
+  %69 = icmp ult i32 %67, 0
+  %70 = select i1 %69, i32 0, i32 0
+  %71 = sitofp i32 %70 to float
+  %72 = tail call float @llvm.fmuladd.f32(float %71, float 0.000000e+00, float 0.000000e+00)
+  %73 = fadd float %72, 0.000000e+00
+  %74 = select i1 false, float 0.000000e+00, float %73
+  %75 = bitcast float %74 to i32
+  %76 = and i32 %75, 0
+  %77 = bitcast i32 %76 to float
+  %78 = tail call float @llvm.fmuladd.f32(float 0.000000e+00, float %77, float 0.000000e+00)
+  %79 = fadd float 0.000000e+00, 0.000000e+00
+  %80 = fcmp oeq float %78, 0.000000e+00
+  %81 = fcmp ogt float %79, 0.000000e+00
+  %82 = icmp eq i32 %0, 0
+  %83 = bitcast float 0.000000e+00 to i32
+  %84 = icmp eq i32 %83, 0
+  %85 = icmp ult i32 %83, 0
+  %86 = select i1 %85, i32 0, i32 0
+  %87 = sitofp i32 %86 to float
+  %88 = tail call float @llvm.fmuladd.f32(float %87, float 0.000000e+00, float 0.000000e+00)
+  %89 = fadd float %88, 0.000000e+00
+  %90 = select i1 false, float 0.000000e+00, float %89
+  %91 = bitcast float %90 to i32
+  %92 = and i32 %91, 0
+  %93 = bitcast i32 %92 to float
+  %94 = tail call float @llvm.fmuladd.f32(float 0.000000e+00, float %93, float 0.000000e+00)
+  %95 = fcmp oeq float %94, 0.000000e+00
+  %96 = bitcast float 0.000000e+00 to i32
+  %97 = bitcast float 0.000000e+00 to i32
+  %98 = icmp ult i32 %97, 0
+  %99 = select i1 %98, i32 0, i32 0
+  %100 = sitofp i32 %99 to float
+  %101 = tail call float @llvm.fmuladd.f32(float %100, float 0.000000e+00, float 0.000000e+00)
+  %102 = fadd float %101, 0.000000e+00
+  %103 = select i1 false, float 0.000000e+00, float %102
+  %104 = bitcast float %103 to i32
+  %105 = and i32 %104, 0
+  %106 = bitcast i32 %105 to float
+  %107 = tail call float @llvm.fmuladd.f32(float 0.000000e+00, float %106, float 0.000000e+00)
+  %108 = fcmp oeq float %107, 0.000000e+00
+  %109 = icmp eq i32 %96, 0
+  %110 = icmp eq i32 %0, 0
+  %111 = icmp ult i32 0, 0
+  %112 = bitcast float 0.000000e+00 to i32
+  %113 = icmp ult i32 %112, 0
+  %114 = select i1 %113, i32 0, i32 0
+  %115 = sitofp i32 %114 to float
+  %116 = tail call float @llvm.fmuladd.f32(float %115, float 0.000000e+00, float 0.000000e+00)
+  %117 = fadd float %116, 0.000000e+00
+  %118 = select i1 false, float 0.000000e+00, float %117
+  %119 = bitcast float %118 to i32
+  %120 = and i32 %119, 0
+  %121 = bitcast i32 %120 to float
+  %122 = tail call float @llvm.fmuladd.f32(float 0.000000e+00, float %121, float 0.000000e+00)
+  %123 = fadd float 0.000000e+00, 0.000000e+00
+  %124 = fcmp oeq float %122, 0.000000e+00
+  %125 = fcmp ogt float %123, 0.000000e+00
+  %126 = icmp ult i32 0, 0
+  %127 = bitcast float 0.000000e+00 to i32
+  %128 = icmp ult i32 %127, 0
+  %129 = select i1 %128, i32 0, i32 0
+  %130 = sitofp i32 %129 to float
+  %131 = tail call float @llvm.fmuladd.f32(float %130, float 0.000000e+00, float 0.000000e+00)
+  %132 = fadd float %131, 0.000000e+00
+  %133 = select i1 false, float 0.000000e+00, float %132
+  %134 = bitcast float %133 to i32
+  %135 = and i32 %134, 0
+  %136 = bitcast i32 %135 to float
+  %137 = tail call float @llvm.fmuladd.f32(float 0.000000e+00, float %136, float 0.000000e+00)
+  %138 = fcmp oeq float %137, 0.000000e+00
+  %139 = icmp ult i32 0, 0
+  %140 = bitcast float 0.000000e+00 to i32
+  %141 = icmp eq i32 0, 0
+  %142 = icmp ult i32 %140, 0
+  %143 = select i1 %142, i32 0, i32 0
+  %144 = sitofp i32 %143 to float
+  %145 = tail call float @llvm.fmuladd.f32(float %144, float 0.000000e+00, float 0.000000e+00)
+  %146 = fadd float %145, 0.000000e+00
+  %147 = select i1 false, float 0.000000e+00, float %146
+  %148 = bitcast float %147 to i32
+  %149 = and i32 %148, 0
+  %150 = bitcast i32 %149 to float
+  %151 = tail call float @llvm.fmuladd.f32(float 0.000000e+00, float %150, float 0.000000e+00)
+  %152 = fcmp oeq float %151, 0.000000e+00
+  %153 = fcmp olt float 0.000000e+00, 0.000000e+00
+  %154 = select i1 %153, float 0.000000e+00, float %10
+  %155 = bitcast float %154 to i32
+  %156 = and i32 %155, 0
+  %157 = bitcast i32 %156 to float
+  %158 = tail call float @llvm.fmuladd.f32(float 0.000000e+00, float %157, float 0.000000e+00)
+  %159 = fcmp oeq float %158, 0.000000e+00
+  %160 = bitcast float 0.000000e+00 to i32
+  %161 = icmp eq i32 %160, 0
+  %162 = icmp ult i32 %160, 0
+  %163 = select i1 %162, i32 0, i32 0
+  %164 = sitofp i32 %163 to float
+  %165 = tail call float @llvm.fmuladd.f32(float %164, float 0.000000e+00, float 0.000000e+00)
+  %166 = fadd float %165, 0.000000e+00
+  %167 = select i1 false, float 0.000000e+00, float %166
+  %168 = bitcast float %167 to i32
+  %169 = and i32 %168, 0
+  %170 = bitcast i32 %169 to float
+  %171 = tail call float @llvm.fmuladd.f32(float 0.000000e+00, float %170, float 0.000000e+00)
+  %172 = fcmp oeq float %171, 0.000000e+00
+  %173 = tail call float @llvm.fmuladd.f32(float %8, float 0.000000e+00, float 0.000000e+00)
+  %174 = fadd float %173, 0.000000e+00
+  %175 = select i1 %5, float 0.000000e+00, float %174
+  %176 = bitcast float %175 to i32
+  %177 = and i32 %176, 0
+  %178 = bitcast i32 %177 to float
+  %179 = tail call float @llvm.fmuladd.f32(float 0.000000e+00, float %178, float 0.000000e+00)
+  %180 = fadd float 0.000000e+00, 0.000000e+00
+  %181 = fcmp oeq float %179, 0.000000e+00
+  %182 = fcmp ogt float %180, 0.000000e+00
+  %183 = fcmp olt float %8, 0.000000e+00
+  %184 = bitcast float 0.000000e+00 to i32
+  %185 = icmp eq i32 %0, %0
+  %186 = icmp ult i32 %184, 0
+  %187 = select i1 %186, i32 0, i32 0
+  %188 = sitofp i32 %187 to float
+  %189 = tail call float @llvm.fmuladd.f32(float %188, float 0.000000e+00, float 0.000000e+00)
+  %190 = fadd float %189, 0.000000e+00
+  %191 = select i1 %5, float 0.000000e+00, float %190
+  %192 = bitcast float %191 to i32
+  %193 = and i32 %192, 0
+  %194 = bitcast i32 %193 to float
+  %195 = tail call float @llvm.fmuladd.f32(float 0.000000e+00, float %194, float 0.000000e+00)
+  %196 = fcmp oeq float %195, 0.000000e+00
+  %197 = bitcast float 0.000000e+00 to i32
+  %198 = icmp eq i32 %197, 0
+  %199 = icmp ult i32 %197, 0
+  %200 = select i1 %199, i32 0, i32 0
+  %201 = sitofp i32 %200 to float
+  %202 = tail call float @llvm.fmuladd.f32(float %201, float 0.000000e+00, float 0.000000e+00)
+  %203 = fadd float %202, 0.000000e+00
+  %204 = select i1 false, float 0.000000e+00, float %203
+  %205 = bitcast float %204 to i32
+  %206 = and i32 %205, 0
+  %207 = bitcast i32 %206 to float
+  %208 = tail call float @llvm.fmuladd.f32(float 0.000000e+00, float %207, float 0.000000e+00)
+  %209 = fcmp oeq float %208, 0.000000e+00
+  %210 = fcmp ogt float %201, 0.000000e+00
+  %211 = tail call float @llvm.fmuladd.f32(float 0.000000e+00, float %14, float 0.000000e+00)
+  %212 = fcmp oeq float %211, 0.000000e+00
+  ret <16 x half> zeroinitializer
+}
+

Original file line number	Diff line number	Diff line change
`@@ -14898,6 +14898,12 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,`
`14898`	`14898`	`Resized = true;`
`14899`	`14899`	`GatheredScalars.append(VF - GatheredScalars.size(),`
`14900`	`14900`	`PoisonValue::get(OrigScalarTy));`
	`14901`	`+ NumParts = TTI->getNumberOfParts(getWidenedType(OrigScalarTy, VF));`
	`14902`	`+ if (NumParts == 0 \|\| NumParts >= GatheredScalars.size() \|\|`
	`14903`	`+ VecTy->getNumElements() % NumParts != 0 \|\|`
	`14904`	`+ !hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(),`
	`14905`	`+ VecTy->getNumElements() / NumParts))`
	`14906`	`+ NumParts = 1;`
`14901`	`14907`	`}`
`14902`	`14908`	`}`
`14903`	`14909`	`}`