
Commit bcfc9f4

[SLP][REVEC] VectorValuesAndScales should be supported by REVEC. (#135762)
We should align REVEC with the SLP algorithm as closely as possible, for example by applying REVEC-specific handling when calling IRBuilder's Create methods, performing cost analysis via TTI, and expanding shuffle masks using transformScalarShuffleIndicesToVector. Reference commit: 3b18d47
1 parent 9ab2dea commit bcfc9f4
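
The mechanical core of the change is per-lane reduction: when the reduction root is itself a fixed-width vector (the REVEC case), lane I of the result is gathered from the concatenated operands with a strided shuffle mask and reduced on its own, as the comments in the diff below illustrate. Here is a minimal standalone sketch of the mask construction; strideMask is a hypothetical re-implementation for illustration, not the LLVM source, modeling what llvm::createStrideMask(Start, Stride, VF) produces per the mask math visible in the diff:

// Hypothetical model of the strided mask: select lane `Start` of each of the
// VF concatenated sub-vectors of width `Stride` (mask[J] = Start + J * Stride).
#include <cstdio>
#include <vector>

static std::vector<int> strideMask(unsigned Start, unsigned Stride, unsigned VF) {
  std::vector<int> Mask(VF);
  for (unsigned J = 0; J < VF; ++J)
    Mask[J] = static_cast<int>(Start + J * Stride);
  return Mask;
}

int main() {
  // Two <4 x Ty> operands <a,b,c,d> and <e,f,g,h> live in one <8 x Ty> value
  // <a,b,c,d,e,f,g,h>; lane I of the <4 x Ty> result reduces the elements
  // picked by strideMask(I, /*Stride=*/4, /*VF=*/2).
  const char *Elts = "abcdefgh";
  for (unsigned I = 0; I < 4; ++I) {
    std::printf("Lane[%u] = <", I);
    for (int Idx : strideMask(I, 4, 2))
      std::printf(" %c", Elts[Idx]);
    std::printf(" >\n"); // prints Lane[0] = < a e > ... Lane[3] = < d h >,
  }                      // matching the Lane[...] comments in the diff.
  return 0;
}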

File tree

3 files changed: +144 -77 lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 64 additions & 73 deletions
@@ -22144,53 +22144,16 @@ class HorizontalReduction {
       }
 
       Type *ScalarTy = VL.front()->getType();
-      if (isa<FixedVectorType>(ScalarTy)) {
-        assert(SLPReVec && "FixedVectorType is not expected.");
-        unsigned ScalarTyNumElements = getNumElements(ScalarTy);
-        Value *ReducedSubTree = PoisonValue::get(
-            getWidenedType(ScalarTy->getScalarType(), ScalarTyNumElements));
-        for (unsigned I : seq<unsigned>(ScalarTyNumElements)) {
-          // Do reduction for each lane.
-          // e.g., do reduce add for
-          // VL[0] = <4 x Ty> <a, b, c, d>
-          // VL[1] = <4 x Ty> <e, f, g, h>
-          // Lane[0] = <2 x Ty> <a, e>
-          // Lane[1] = <2 x Ty> <b, f>
-          // Lane[2] = <2 x Ty> <c, g>
-          // Lane[3] = <2 x Ty> <d, h>
-          // result[0] = reduce add Lane[0]
-          // result[1] = reduce add Lane[1]
-          // result[2] = reduce add Lane[2]
-          // result[3] = reduce add Lane[3]
-          SmallVector<int, 16> Mask =
-              createStrideMask(I, ScalarTyNumElements, VL.size());
-          Value *Lane = Builder.CreateShuffleVector(VectorizedRoot, Mask);
-          Value *Val =
-              createSingleOp(Builder, *TTI, Lane,
-                             OptReusedScalars && SameScaleFactor
-                                 ? SameValuesCounter.front().second
-                                 : 1,
-                             Lane->getType()->getScalarType() !=
-                                     VL.front()->getType()->getScalarType()
-                                 ? V.isSignedMinBitwidthRootNode()
-                                 : true,
-                             RdxRootInst->getType());
-          ReducedSubTree =
-              Builder.CreateInsertElement(ReducedSubTree, Val, I);
-        }
-        VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
-      } else {
-        Type *VecTy = VectorizedRoot->getType();
-        Type *RedScalarTy = VecTy->getScalarType();
-        VectorValuesAndScales.emplace_back(
-            VectorizedRoot,
-            OptReusedScalars && SameScaleFactor
-                ? SameValuesCounter.front().second
-                : 1,
-            RedScalarTy != ScalarTy->getScalarType()
-                ? V.isSignedMinBitwidthRootNode()
-                : true);
-      }
+      Type *VecTy = VectorizedRoot->getType();
+      Type *RedScalarTy = VecTy->getScalarType();
+      VectorValuesAndScales.emplace_back(
+          VectorizedRoot,
+          OptReusedScalars && SameScaleFactor
+              ? SameValuesCounter.front().second
+              : 1,
+          RedScalarTy != ScalarTy->getScalarType()
+              ? V.isSignedMinBitwidthRootNode()
+              : true);
 
       // Count vectorized reduced values to exclude them from final reduction.
       for (Value *RdxVal : VL) {
@@ -22363,9 +22326,35 @@ class HorizontalReduction {
   Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
                         Value *Vec, unsigned Scale, bool IsSigned,
                         Type *DestTy) {
-    Value *Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
-    if (Rdx->getType() != DestTy->getScalarType())
-      Rdx = Builder.CreateIntCast(Rdx, DestTy->getScalarType(), IsSigned);
+    Value *Rdx;
+    if (auto *VecTy = dyn_cast<FixedVectorType>(DestTy)) {
+      unsigned DestTyNumElements = getNumElements(VecTy);
+      unsigned VF = getNumElements(Vec->getType()) / DestTyNumElements;
+      Rdx = PoisonValue::get(
+          getWidenedType(Vec->getType()->getScalarType(), DestTyNumElements));
+      for (unsigned I : seq<unsigned>(DestTyNumElements)) {
+        // Do reduction for each lane.
+        // e.g., do reduce add for
+        // VL[0] = <4 x Ty> <a, b, c, d>
+        // VL[1] = <4 x Ty> <e, f, g, h>
+        // Lane[0] = <2 x Ty> <a, e>
+        // Lane[1] = <2 x Ty> <b, f>
+        // Lane[2] = <2 x Ty> <c, g>
+        // Lane[3] = <2 x Ty> <d, h>
+        // result[0] = reduce add Lane[0]
+        // result[1] = reduce add Lane[1]
+        // result[2] = reduce add Lane[2]
+        // result[3] = reduce add Lane[3]
+        SmallVector<int, 16> Mask = createStrideMask(I, DestTyNumElements, VF);
+        Value *Lane = Builder.CreateShuffleVector(Vec, Mask);
+        Rdx = Builder.CreateInsertElement(
+            Rdx, emitReduction(Lane, Builder, &TTI, DestTy), I);
+      }
+    } else {
+      Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
+    }
+    if (Rdx->getType() != DestTy)
+      Rdx = Builder.CreateIntCast(Rdx, DestTy, IsSigned);
     // Improved analysis for add/fadd/xor reductions with same scale
     // factor for all operands of reductions. We can emit scalar ops for
     // them instead.
@@ -22432,30 +22421,32 @@ class HorizontalReduction {
       case RecurKind::FMul: {
         unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
         if (!AllConsts) {
-          if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
-            assert(SLPReVec && "FixedVectorType is not expected.");
-            unsigned ScalarTyNumElements = VecTy->getNumElements();
-            for (unsigned I : seq<unsigned>(ReducedVals.size())) {
-              VectorCost += TTI->getShuffleCost(
-                  TTI::SK_PermuteSingleSrc, VectorTy,
-                  createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
-              VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy, FMF,
-                                                            CostKind);
-            }
-            VectorCost += TTI->getScalarizationOverhead(
-                VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
-                /*Extract*/ false, TTI::TCK_RecipThroughput);
-          } else if (DoesRequireReductionOp) {
-            Type *RedTy = VectorTy->getElementType();
-            auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
-                std::make_pair(RedTy, true));
-            if (RType == RedTy) {
-              VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
-                                                           FMF, CostKind);
+          if (DoesRequireReductionOp) {
+            if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
+              assert(SLPReVec && "FixedVectorType is not expected.");
+              unsigned ScalarTyNumElements = VecTy->getNumElements();
+              for (unsigned I : seq<unsigned>(ReducedVals.size())) {
+                VectorCost += TTI->getShuffleCost(
+                    TTI::SK_PermuteSingleSrc, VectorTy,
+                    createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
+                VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy,
+                                                              FMF, CostKind);
+              }
+              VectorCost += TTI->getScalarizationOverhead(
+                  VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
+                  /*Extract*/ false, TTI::TCK_RecipThroughput);
             } else {
-              VectorCost = TTI->getExtendedReductionCost(
-                  RdxOpcode, !IsSigned, RedTy, getWidenedType(RType, ReduxWidth),
-                  FMF, CostKind);
+              Type *RedTy = VectorTy->getElementType();
+              auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
+                  std::make_pair(RedTy, true));
+              if (RType == RedTy) {
+                VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
+                                                             FMF, CostKind);
+              } else {
+                VectorCost = TTI->getExtendedReductionCost(
+                    RdxOpcode, !IsSigned, RedTy,
+                    getWidenedType(RType, ReduxWidth), FMF, CostKind);
+              }
             }
           } else {
             Type *RedTy = VectorTy->getElementType();
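
Read as a formula (my paraphrase of the REVEC branch above, not text from the commit): for a FixedVectorType ScalarTy with N elements and VF = ReducedVals.size() reduced operands,

    VectorCost = sum over I in [0, VF) of
                     ShuffleCost(SK_PermuteSingleSrc, VectorTy, strideMask(I, N, VF))
                   + ArithmeticReductionCost(RdxOpcode, <N x elt>)
                 + ScalarizationOverhead(insert all N lanes of the result)

Note the REVEC costing is now also nested under DoesRequireReductionOp, so it is only charged when a reduction instruction will actually be emitted.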

llvm/test/Transforms/SLPVectorizer/SystemZ/revec-fix-128169.ll

Lines changed: 4 additions & 4 deletions
@@ -44,16 +44,16 @@ define void @e(<4 x i16> %0) {
 ; THRESH-NEXT:    [[TMP13:%.*]] = icmp sgt <16 x i16> [[TMP12]], [[TMP7]]
 ; THRESH-NEXT:    [[TMP14:%.*]] = shufflevector <16 x i1> [[TMP13]], <16 x i1> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
 ; THRESH-NEXT:    [[TMP15:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP14]])
+; THRESH-NEXT:    [[TMP23:%.*]] = insertelement <4 x i1> poison, i1 [[TMP15]], i64 0
 ; THRESH-NEXT:    [[TMP16:%.*]] = shufflevector <16 x i1> [[TMP13]], <16 x i1> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
 ; THRESH-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP16]])
+; THRESH-NEXT:    [[TMP24:%.*]] = insertelement <4 x i1> [[TMP23]], i1 [[TMP17]], i64 1
 ; THRESH-NEXT:    [[TMP18:%.*]] = shufflevector <16 x i1> [[TMP13]], <16 x i1> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
 ; THRESH-NEXT:    [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP18]])
+; THRESH-NEXT:    [[TMP22:%.*]] = insertelement <4 x i1> [[TMP24]], i1 [[TMP19]], i64 2
 ; THRESH-NEXT:    [[TMP20:%.*]] = shufflevector <16 x i1> [[TMP13]], <16 x i1> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
 ; THRESH-NEXT:    [[TMP21:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP20]])
-; THRESH-NEXT:    [[TMP22:%.*]] = insertelement <4 x i1> poison, i1 [[TMP15]], i32 0
-; THRESH-NEXT:    [[TMP23:%.*]] = insertelement <4 x i1> [[TMP22]], i1 [[TMP17]], i32 1
-; THRESH-NEXT:    [[TMP24:%.*]] = insertelement <4 x i1> [[TMP23]], i1 [[TMP19]], i32 2
-; THRESH-NEXT:    [[TMP25:%.*]] = insertelement <4 x i1> [[TMP24]], i1 [[TMP21]], i32 3
+; THRESH-NEXT:    [[TMP25:%.*]] = insertelement <4 x i1> [[TMP22]], i1 [[TMP21]], i64 3
 ; THRESH-NEXT:    [[TMP26]] = zext <4 x i1> [[TMP25]] to <4 x i32>
 ; THRESH-NEXT:    br label [[VECTOR_BODY]]
 ;
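
Two knock-on effects of routing codegen through createSingleOp are visible in this hunk: each insertelement is now emitted immediately after its per-lane reduction rather than batched after all four reductions, and the lane indices are i64 rather than i32 constants (presumably because IRBuilder's integer-index CreateInsertElement overload takes a uint64_t; this is my inference, not stated in the commit).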
Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 < %s | FileCheck %s
+
+define <4 x i16> @test() {
+; CHECK-LABEL: define <4 x i16> @test() {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i16> @llvm.vector.insert.v8i16.v4i16(<8 x i16> poison, <4 x i16> zeroinitializer, i64 0)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.vector.insert.v8i16.v4i16(<8 x i16> [[TMP0]], <4 x i16> zeroinitializer, i64 4)
+; CHECK-NEXT:    [[TMP2:%.*]] = add <8 x i16> [[TMP1]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> poison, <4 x i16> zeroinitializer, i64 0)
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP4]], <16 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> [[TMP4]], <4 x i16> zeroinitializer, i64 4)
+; CHECK-NEXT:    [[TMP7:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> [[TMP6]], <4 x i16> zeroinitializer, i64 8)
+; CHECK-NEXT:    [[TMP8:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> [[TMP7]], <4 x i16> zeroinitializer, i64 12)
+; CHECK-NEXT:    [[TMP9:%.*]] = add <16 x i16> [[TMP5]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add <16 x i16> [[TMP8]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add <16 x i16> [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT:    [[TMP25:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP24]])
+; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <4 x i16> poison, i16 [[TMP25]], i64 0
+; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NEXT:    [[TMP28:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP27]])
+; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP28]], i64 1
+; CHECK-NEXT:    [[TMP30:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-NEXT:    [[TMP31:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP30]])
+; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <4 x i16> [[TMP29]], i16 [[TMP31]], i64 2
+; CHECK-NEXT:    [[TMP33:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:    [[TMP34:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP33]])
+; CHECK-NEXT:    [[TMP35:%.*]] = insertelement <4 x i16> [[TMP32]], i16 [[TMP34]], i64 3
+; CHECK-NEXT:    [[RDX_OP:%.*]] = or <16 x i16> [[TMP11]], [[TMP9]]
+; CHECK-NEXT:    [[TMP36:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT:    [[TMP37:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP36]])
+; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i16> poison, i16 [[TMP37]], i64 0
+; CHECK-NEXT:    [[TMP39:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NEXT:    [[TMP40:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP39]])
+; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <4 x i16> [[TMP38]], i16 [[TMP40]], i64 1
+; CHECK-NEXT:    [[TMP42:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-NEXT:    [[TMP43:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP42]])
+; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i16> [[TMP41]], i16 [[TMP43]], i64 2
+; CHECK-NEXT:    [[TMP45:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:    [[TMP46:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP45]])
+; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP46]], i64 3
+; CHECK-NEXT:    [[OP_RDX9:%.*]] = or <4 x i16> [[TMP47]], zeroinitializer
+; CHECK-NEXT:    [[OP_RDX11:%.*]] = or <4 x i16> [[OP_RDX9]], [[TMP35]]
+; CHECK-NEXT:    ret <4 x i16> [[OP_RDX11]]
+;
+entry:
+  %subi = add <4 x i16> zeroinitializer, zeroinitializer
+  %sub40.i = add <4 x i16> %subi, zeroinitializer
+  %sub41.i = add <4 x i16> %subi, zeroinitializer
+  %sub42.i = add <4 x i16> %subi, zeroinitializer
+  %sub43.i = add <4 x i16> %subi, zeroinitializer
+  %sub44.i = add <4 x i16> %subi, zeroinitializer
+  %sub45.i = add <4 x i16> %subi, zeroinitializer
+  %sub46.i = add <4 x i16> zeroinitializer, zeroinitializer
+  %sub47.i = add <4 x i16> zeroinitializer, zeroinitializer
+  %sub48.i = add <4 x i16> zeroinitializer, zeroinitializer
+  %sub49.i = add <4 x i16> zeroinitializer, zeroinitializer
+  %or40.i = or <4 x i16> %sub40.i, %sub41.i
+  %or41.i = or <4 x i16> %or40.i, %sub42.i
+  %or42.i = or <4 x i16> %or41.i, %sub43.i
+  %or43.i = or <4 x i16> %or42.i, %sub44.i
+  %or44.i = or <4 x i16> %or43.i, %sub45.i
+  %or45.i = or <4 x i16> %or44.i, %sub46.i
+  %or46.i = or <4 x i16> %or45.i, %sub47.i
+  %or47.i = or <4 x i16> %or46.i, %sub48.i
+  %or48.i = or <4 x i16> %or47.i, %sub49.i
+  %or50.i = or <4 x i16> %or48.i, %subi
+  %subii = add <4 x i16> zeroinitializer, zeroinitializer
+  %subi16.i = add <4 x i16> %subii, zeroinitializer
+  %subi17.i = add <4 x i16> %subii, zeroinitializer
+  %0 = or <4 x i16> %subi16.i, %subi17.i
+  %1 = or <4 x i16> %0, %or50.i
+  ret <4 x i16> %1
+}
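
The CHECK lines above were generated by utils/update_test_checks.py, as the NOTE line records, and can be regenerated the same way if the output changes. The RUN line forces vectorization regardless of profitability (-slp-threshold=-100) and allows wide registers (-slp-max-reg-size=1024) so the <16 x i16> intermediate seen in the checks can form; -slp-revec enables the REVEC path under test.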
