Improve vector element insertion costs.

JonPsson1 · JonPsson1 · commit 594147fd8e3a · 2024-10-25T17:44:38.000+02:00
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -469,6 +469,33 @@ bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
   return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
 }
 
+// Scalarization overhead for the DemandedElts of Ty. Insertions into vectors
+// of i64 are costed here per element pair; all else is left to BaseT.
+InstructionCost SystemZTTIImpl::getScalarizationOverhead(
+    VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
+    TTI::TargetCostKind CostKind) {
+  unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
+  InstructionCost Cost = 0;
+
+  if (Insert && Ty->isIntOrIntVectorTy(64)) {
+    // VLVGP will insert two GPRs with one instruction, so charge one
+    // instruction per (even, odd) element pair with a demanded element.
+    for (unsigned Idx = 0; Idx < NumElts; Idx += 2) {
+      // With an odd NumElts the last element has no partner, but it still
+      // takes one instruction to insert and must not be dropped.
+      bool HasPartner = Idx + 1 < NumElts;
+      if (DemandedElts[Idx] || (HasPartner && DemandedElts[Idx + 1]))
+        ++Cost;
+    }
+    // Insertions are accounted for above, so don't request them from BaseT.
+    Insert = false;
+  }
+
+  Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
+                                          Extract, CostKind);
+  return Cost;
+}
+
 // Return the bit size for the scalar type or vector element
 // type. getScalarSizeInBits() returns 0 for a pointer type.
 static unsigned getScalarSizeInBits(Type *Ty) {
@@ -610,7 +637,7 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
     if (DivRemConst) {
       SmallVector<Type *> Tys(Args.size(), Ty);
       return VF * DivMulSeqCost +
-             getScalarizationOverhead(VTy, Args, Tys, CostKind);
+             BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
     }
     if ((SignedDivRem || UnsignedDivRem) && VF > 4)
       // Temporary hack: disable high vectorization factors with integer
@@ -637,7 +664,7 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
         SmallVector<Type *> Tys(Args.size(), Ty);
         InstructionCost Cost =
             (VF * ScalarCost) +
-            getScalarizationOverhead(VTy, Args, Tys, CostKind);
+            BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
         // FIXME: VF 2 for these FP operations are currently just as
         // expensive as for VF 4.
         if (VF == 2)
@@ -655,8 +682,9 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
     // There is no native support for FRem.
     if (Opcode == Instruction::FRem) {
       SmallVector<Type *> Tys(Args.size(), Ty);
-      InstructionCost Cost = (VF * LIBCALL_COST) +
-                             getScalarizationOverhead(VTy, Args, Tys, CostKind);
+      InstructionCost Cost =
+          (VF * LIBCALL_COST) +
+          BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
       // FIXME: VF 2 for float is currently just as expensive as for VF 4.
       if (VF == 2 && ScalarBits == 32)
         Cost *= 2;
@@ -976,10 +1004,10 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
           (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
         NeedsExtracts = false;
 
-      TotCost += getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
-                                          NeedsExtracts, CostKind);
-      TotCost += getScalarizationOverhead(DstVecTy, NeedsInserts,
-                                          /*Extract*/ false, CostKind);
+      TotCost += BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
+                                                 NeedsExtracts, CostKind);
+      TotCost += BaseT::getScalarizationOverhead(DstVecTy, NeedsInserts,
+                                                 /*Extract*/ false, CostKind);
 
       // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
       if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
@@ -991,8 +1019,8 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
     if (Opcode == Instruction::FPTrunc) {
       if (SrcScalarBits == 128)  // fp128 -> double/float + inserts of elements.
         return VF /*ldxbr/lexbr*/ +
-               getScalarizationOverhead(DstVecTy, /*Insert*/ true,
-                                        /*Extract*/ false, CostKind);
+               BaseT::getScalarizationOverhead(DstVecTy, /*Insert*/ true,
+                                               /*Extract*/ false, CostKind);
       else // double -> float
         return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
     }
@@ -1005,8 +1033,8 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
         return VF * 2;
       }
       // -> fp128.  VF * lxdb/lxeb + extraction of elements.
-      return VF + getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
-                                           /*Extract*/ true, CostKind);
+      return VF + BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
+                                                  /*Extract*/ true, CostKind);
     }
   }
 
@@ -1115,10 +1143,17 @@ InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                    TTI::TargetCostKind CostKind,
                                                    unsigned Index, Value *Op0,
                                                    Value *Op1) {
-  // vlvgp will insert two grs into a vector register, so only count half the
-  // number of instructions.
-  if (Opcode == Instruction::InsertElement && Val->isIntOrIntVectorTy(64))
-    return ((Index % 2 == 0) ? 1 : 0);
+  if (Opcode == Instruction::InsertElement) {
+    // Vector Element Load.
+    if (Op1 != nullptr && Op1->hasOneUse() && isa<LoadInst>(Op1))
+      return 0;
+
+    // vlvgp will insert two grs into a vector register, so count half the
+    // number of instructions as an estimate, since the full picture is not
+    // available here (unlike in getScalarizationOverhead()).
+    if (Val->isIntOrIntVectorTy(64))
+      return ((Index % 2 == 0) ? 1 : 0);
+  }
 
   if (Opcode == Instruction::ExtractElement) {
     int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /*+test-under-mask*/ : 1);
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -81,6 +81,10 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
   bool hasDivRemOp(Type *DataType, bool IsSigned);
   bool prefersVectorizedAddressing() { return false; }
   bool LSRWithInstrQueries() { return true; }
+  InstructionCost getScalarizationOverhead(VectorType *Ty,
+                                           const APInt &DemandedElts,
+                                           bool Insert, bool Extract,
+                                           TTI::TargetCostKind CostKind);
   bool supportsEfficientVectorElementLoadStore() { return true; }
   bool enableInterleavedAccessVectorization() { return true; }
 
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3033,8 +3033,8 @@ class BoUpSLP {
       unsigned NumParts, bool ForOrder = false);
 
   /// \returns the scalarization cost for this list of values. Assuming that
-  /// this subtree gets vectorized, we may need to extract the values from the
-  /// roots. This method calculates the cost of extracting the values.
+  /// this subtree gets vectorized, we may need to insert the values into a
+  /// vector. This method calculates the cost of inserting the values.
   /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
   InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
                                 Type *ScalarTy) const;
@@ -13013,7 +13013,15 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
               TTI::SK_InsertSubvector, VecTy, std::nullopt, CostKind,
               I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
     } else {
-      Cost = TTI->getScalarizationOverhead(VecTy, ~ShuffledElements,
+      // Add insertion costs for all elements, but not for loads that can be
+      // loaded directly into a vector element for free.
+      APInt FreeEltLoads = APInt::getZero(VL.size());
+      if (TTI->supportsEfficientVectorElementLoadStore())
+        for (unsigned I = 0, E = VL.size(); I < E; ++I)
+          if (VL[I]->hasOneUse() && isa<LoadInst>(VL[I]))
+            FreeEltLoads.setBit(I);
+      APInt DemandedElts = ~ShuffledElements & ~FreeEltLoads;
+      Cost = TTI->getScalarizationOverhead(VecTy, DemandedElts,
                                            /*Insert*/ true,
                                            /*Extract*/ false, CostKind);
     }
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
@@ -1,8 +1,4 @@
-; RUN: opt < %s -mtriple=s390x-unknown-linux -mcpu=z16 -S -passes=slp-vectorizer \
-; RUN:   -pass-remarks-output=%t | FileCheck %s
-; RUN: cat %t | FileCheck -check-prefix=REMARK %s
-;
-; NB! This is a pre-commit version (for #112491) with current codegen and remarks.
+; RUN: opt -S --passes=slp-vectorizer -mtriple=s390x-unknown-linux -mcpu=z16 < %s | FileCheck %s
 ;
 ; Test functions that (at least currently) only gets vectorized if the
 ; insertion cost for an element load is counted as free.
@@ -11,19 +7,8 @@
 ; getGatherCost().
 define void @fun0(ptr nocapture %0, double %1) {
 ; CHECK-LABEL: define void @fun0(
-; CHECK:         fmul double
-; CHECK:         call double @llvm.fmuladd.f64(
-; CHECK-NEXT:    call double @llvm.fmuladd.f64(
-; CHECK-NEXT:    call double @llvm.sqrt.f64(
-; CHECK:         fmul double
-; CHECK:         call double @llvm.fmuladd.f64(
-; CHECK-NEXT:    call double @llvm.fmuladd.f64(
-; CHECK-NEXT:    call double @llvm.sqrt.f64(
-;
-; REMARK-LABEL: Function: fun0
-; REMARK: Args:
-; REMARK-NEXT: - String:          'List vectorization was possible but not beneficial with cost '
-; REMARK-NEXT: - Cost:            '0'
+; CHECK:    fmul <2 x double>
+; CHECK:    call <2 x double> @llvm.fmuladd.v2f64(
 
   %3 = fmul double %1, 2.000000e+00
   %4 = tail call double @llvm.fmuladd.f64(double %3, double %3, double 0.000000e+00)
@@ -39,36 +24,18 @@ define void @fun0(ptr nocapture %0, double %1) {
   ret void
 }
 
+
 ; This function needs the element-load to be recognized in SystemZ
 ; getVectorInstrCost().
-define void @fun1(double %0) {
+define void @fun1(double %0) local_unnamed_addr {
 ; CHECK-LABEL: define void @fun1(
-; CHECK:         phi double
-; CHECK-NEXT:    phi double
-; CHECK-NEXT:    phi double
-; CHECK-NEXT:    phi double
-; CHECK-NEXT:    phi double
-; CHECK-NEXT:    phi double
-; CHECK-NEXT:    fsub double
-; CHECK-NEXT:    fsub double
-; CHECK-NEXT:    fmul double
-; CHECK-NEXT:    fmul double
-; CHECK-NEXT:    fsub double
-; CHECK-NEXT:    fsub double
-; CHECK-NEXT:    call double @llvm.fmuladd.f64(
-; CHECK-NEXT:    call double @llvm.fmuladd.f64(
-; CHECK-NEXT:    fsub double
-; CHECK-NEXT:    fsub double
-; CHECK-NEXT:    call double @llvm.fmuladd.f64(
-; CHECK-NEXT:    call double @llvm.fmuladd.f64(
-; CHECK:         fcmp olt double
-; CHECK-NEXT:    fcmp olt double
-; CHECK-NEXT:    or i1
-;
-; REMARK-LABEL: Function: fun1
-; REMARK: Args:
-; REMARK:      - String:          'List vectorization was possible but not beneficial with cost '
-; REMARK-NEXT: - Cost:            '0'
+; CHECK:    fsub <2 x double>
+; CHECK:    fsub <2 x double>
+; CHECK:    fsub <2 x double>
+; CHECK:    fmul <2 x double>
+; CHECK:    call <2 x double> @llvm.fmuladd.v2f64(
+; CHECK:    call <2 x double> @llvm.fmuladd.v2f64(
+; CHECK:    %14 = fcmp olt <2 x double> %13, %2
 
   br label %2
 
@@ -104,14 +71,7 @@ declare double @llvm.fmuladd.f64(double, double, double)
 ; which is recognized in SystemZTTImpl::getScalarizationOverhead().
 define void @fun2(ptr %0, ptr %Dst) {
 ; CHECK-LABEL: define void @fun2(
-; CHECK: insertelement
-; CHECK: store <2 x i64>
-;
-; REMARK-LABEL: Function: fun2
-; REMARK: Args:
-; REMARK-NEXT: - String:          'Stores SLP vectorized with cost '
-; REMARK-NEXT: - Cost:            '-1'
-
+; CHECK-NOT: store <2 x i64>
   %3 = load i64, ptr %0, align 8
   %4 = icmp eq i64 %3, 0
   br i1 %4, label %5, label %6