Commit b75e1b5

Improvements to vector element insertion costs.
1 parent c76045d

File tree: 4 files changed (+154, -19 lines)

llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp

Lines changed: 51 additions & 16 deletions
@@ -469,6 +469,33 @@ bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
   return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
 }
 
+InstructionCost SystemZTTIImpl::
+getScalarizationOverhead(VectorType *Ty,
+                         const APInt &DemandedElts,
+                         bool Insert, bool Extract,
+                         TTI::TargetCostKind CostKind) {
+  unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
+  InstructionCost Cost = 0;
+
+  if (Insert && Ty->isIntOrIntVectorTy(64)) {
+    // VLVGP will insert two GPRs with one instruction.
+    InstructionCost CurrVectorCost = 0;
+    for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
+      if (DemandedElts[Idx])
+        ++CurrVectorCost;
+      if (Idx % 2 == 1) {
+        Cost += std::min(InstructionCost(1), CurrVectorCost);
+        CurrVectorCost = 0;
+      }
+    }
+    Insert = false;
+  }
+
+  Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
+                                          Extract, CostKind);
+  return Cost;
+}
+
 // Return the bit size for the scalar type or vector element
 // type. getScalarSizeInBits() returns 0 for a pointer type.
 static unsigned getScalarSizeInBits(Type *Ty) {
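
For illustration only, here is a minimal standalone model of the pairing logic introduced above, in plain C++ (std::vector<bool> and unsigned stand in for APInt and InstructionCost, and pairedInsertCost is a made-up name, not part of the patch):

#include <algorithm>
#include <cstdio>
#include <vector>

// Each VLVGP fills one adjacent pair of i64 lanes from two GPRs, so a pair
// of lanes costs at most one instruction no matter how many of its two
// elements are actually demanded.
static unsigned pairedInsertCost(const std::vector<bool> &DemandedElts) {
  unsigned Cost = 0, CurrVectorCost = 0;
  for (unsigned Idx = 0; Idx < DemandedElts.size(); ++Idx) {
    if (DemandedElts[Idx])
      ++CurrVectorCost;
    if (Idx % 2 == 1) { // close the current even/odd lane pair
      Cost += std::min(1u, CurrVectorCost);
      CurrVectorCost = 0;
    }
  }
  return Cost;
}

int main() {
  // Lanes 0, 2 and 3 of a <4 x i64> demanded: pairs (0,1) and (2,3) each
  // take one VLVGP, so the modeled insertion cost is 2 rather than 3.
  std::printf("%u\n", pairedInsertCost({true, false, true, true}));
  return 0;
}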
@@ -610,7 +637,7 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
   if (DivRemConst) {
     SmallVector<Type *> Tys(Args.size(), Ty);
     return VF * DivMulSeqCost +
-           getScalarizationOverhead(VTy, Args, Tys, CostKind);
+           BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
   }
   if ((SignedDivRem || UnsignedDivRem) && VF > 4)
     // Temporary hack: disable high vectorization factors with integer
@@ -637,7 +664,7 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
     SmallVector<Type *> Tys(Args.size(), Ty);
     InstructionCost Cost =
         (VF * ScalarCost) +
-        getScalarizationOverhead(VTy, Args, Tys, CostKind);
+        BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
     // FIXME: VF 2 for these FP operations are currently just as
     // expensive as for VF 4.
     if (VF == 2)
@@ -655,8 +682,9 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
   // There is no native support for FRem.
   if (Opcode == Instruction::FRem) {
     SmallVector<Type *> Tys(Args.size(), Ty);
-    InstructionCost Cost = (VF * LIBCALL_COST) +
-                           getScalarizationOverhead(VTy, Args, Tys, CostKind);
+    InstructionCost Cost =
+        (VF * LIBCALL_COST) +
+        BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
     // FIXME: VF 2 for float is currently just as expensive as for VF 4.
     if (VF == 2 && ScalarBits == 32)
       Cost *= 2;
@@ -976,10 +1004,10 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
         (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
       NeedsExtracts = false;
 
-    TotCost += getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
-                                        NeedsExtracts, CostKind);
-    TotCost += getScalarizationOverhead(DstVecTy, NeedsInserts,
-                                        /*Extract*/ false, CostKind);
+    TotCost += BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
+                                               NeedsExtracts, CostKind);
+    TotCost += BaseT::getScalarizationOverhead(DstVecTy, NeedsInserts,
+                                               /*Extract*/ false, CostKind);
 
     // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
     if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
@@ -991,8 +1019,8 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
     if (Opcode == Instruction::FPTrunc) {
       if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements.
         return VF /*ldxbr/lexbr*/ +
-               getScalarizationOverhead(DstVecTy, /*Insert*/ true,
-                                        /*Extract*/ false, CostKind);
+               BaseT::getScalarizationOverhead(DstVecTy, /*Insert*/ true,
+                                               /*Extract*/ false, CostKind);
       else // double -> float
         return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
     }
@@ -1005,8 +1033,8 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
         return VF * 2;
       }
       // -> fp128. VF * lxdb/lxeb + extraction of elements.
-      return VF + getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
-                                           /*Extract*/ true, CostKind);
+      return VF + BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
+                                                  /*Extract*/ true, CostKind);
     }
   }

@@ -1115,10 +1143,17 @@ InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                TTI::TargetCostKind CostKind,
                                                unsigned Index, Value *Op0,
                                                Value *Op1) {
-  // vlvgp will insert two grs into a vector register, so only count half the
-  // number of instructions.
-  if (Opcode == Instruction::InsertElement && Val->isIntOrIntVectorTy(64))
-    return ((Index % 2 == 0) ? 1 : 0);
+  if (Opcode == Instruction::InsertElement) {
+    // Vector Element Load.
+    if (Op1 != nullptr && Op1->hasOneUse() && isa<LoadInst>(Op1))
+      return 0;
+
+    // vlvgp will insert two grs into a vector register, so count half the
+    // number of instructions as an estimate when we don't have the full
+    // picture (as in getScalarizationOverhead()).
+    if (Val->isIntOrIntVectorTy(64))
+      return ((Index % 2 == 0) ? 1 : 0);
+  }
 
   if (Opcode == Instruction::ExtractElement) {
     int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /*+test-under-mask*/ : 1);
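
A condensed standalone model of the new InsertElement handling, again in plain C++; the InsertQuery struct, the modeledInsertCost name, and the fall-through cost of 1 are assumptions made for this sketch, since the real hook continues with further cases not shown here:

#include <cstdio>

// Inputs the real hook derives from the IR operands.
struct InsertQuery {
  bool OperandIsSingleUseLoad; // Op1 is a LoadInst with a single use
  bool LaneIs64BitInteger;     // Val->isIntOrIntVectorTy(64)
  unsigned Index;              // lane being written
};

static unsigned modeledInsertCost(const InsertQuery &Q) {
  // A single-use loaded value can be loaded straight into the vector lane
  // (vector element load), so the insert itself is free.
  if (Q.OperandIsSingleUseLoad)
    return 0;
  // i64 lanes are filled two at a time by VLVGP; charge the even lane of
  // each pair and treat the odd lane as free.
  if (Q.LaneIs64BitInteger)
    return (Q.Index % 2 == 0) ? 1 : 0;
  return 1; // assumed default for this sketch: one insert instruction
}

int main() {
  std::printf("%u\n", modeledInsertCost({true, false, 3}));  // 0: folded load
  std::printf("%u\n", modeledInsertCost({false, true, 0}));  // 1: even i64 lane
  std::printf("%u\n", modeledInsertCost({false, true, 1}));  // 0: odd i64 lane
  return 0;
}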

llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h

Lines changed: 4 additions & 0 deletions
@@ -81,6 +81,10 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
   bool hasDivRemOp(Type *DataType, bool IsSigned);
   bool prefersVectorizedAddressing() { return false; }
   bool LSRWithInstrQueries() { return true; }
+  InstructionCost getScalarizationOverhead(VectorType *Ty,
+                                           const APInt &DemandedElts,
+                                           bool Insert, bool Extract,
+                                           TTI::TargetCostKind CostKind);
   bool supportsEfficientVectorElementLoadStore() { return true; }
   bool enableInterleavedAccessVectorization() { return true; }

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 11 additions & 3 deletions
@@ -3025,8 +3025,8 @@ class BoUpSLP {
                            unsigned NumParts, bool ForOrder = false);
 
   /// \returns the scalarization cost for this list of values. Assuming that
-  /// this subtree gets vectorized, we may need to extract the values from the
-  /// roots. This method calculates the cost of extracting the values.
+  /// this subtree gets vectorized, we may need to insert the values from the
+  /// roots. This method calculates the cost of inserting the values.
   /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
   InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
                                 Type *ScalarTy) const;
@@ -12897,7 +12897,15 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
           TTI::SK_InsertSubvector, VecTy, std::nullopt, CostKind,
           I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
     } else {
-      Cost = TTI->getScalarizationOverhead(VecTy, ~ShuffledElements,
+      // Add insertion costs for all elements, but not for loads that can be
+      // loaded directly into a vector element for free.
+      APInt FreeEltLoads = APInt::getZero(VL.size());
+      if (TTI->supportsEfficientVectorElementLoadStore())
+        for (unsigned I = 0, E = VL.size(); I < E; ++I)
+          if (VL[I]->hasOneUse() && isa<LoadInst>(VL[I]))
+            FreeEltLoads.setBit(I);
+      APInt DemandedElts = ~ShuffledElements & ~FreeEltLoads;
+      Cost = TTI->getScalarizationOverhead(VecTy, DemandedElts,
                                            /*Insert*/ true,
                                            /*Extract*/ false, CostKind);
     }
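
The effect of the new mask can be illustrated with a small standalone snippet (plain C++, std::bitset standing in for APInt; the element assignments are invented for the example):

#include <bitset>
#include <cstdio>

int main() {
  constexpr unsigned VF = 4;
  // Gather of 4 scalars: element 1 is already covered by a shuffle, and
  // element 3 is a single-use load the target can insert for free.
  std::bitset<VF> ShuffledElements("0010");
  std::bitset<VF> FreeEltLoads("1000");
  // Only elements that are neither shuffled in nor free element loads get
  // charged an insertion cost.
  std::bitset<VF> DemandedElts = ~ShuffledElements & ~FreeEltLoads;
  std::printf("%s\n", DemandedElts.to_string().c_str()); // "0101": lanes 0 and 2
  return 0;
}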
New test file — Lines changed: 88 additions & 0 deletions
@@ -0,0 +1,88 @@
+; RUN: opt -S --passes=slp-vectorizer -mtriple=s390x-unknown-linux -mcpu=z16 < %s | FileCheck %s
+;
+; Test functions that (at least currently) only get vectorized if the
+; insertion cost for an element load is counted as free.
+
+; This function needs the free element load to be recognized in SLP
+; getGatherCost().
+define void @fun0(ptr nocapture %0, double %1) {
+; CHECK-LABEL: define void @fun0(
+; CHECK: fmul <2 x double>
+; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
+
+  %3 = fmul double %1, 2.000000e+00
+  %4 = tail call double @llvm.fmuladd.f64(double %3, double %3, double 0.000000e+00)
+  %5 = tail call double @llvm.fmuladd.f64(double %3, double %3, double %4)
+  %sqrt1 = tail call double @llvm.sqrt.f64(double %5)
+  %6 = load double, ptr %0, align 8
+  %7 = fmul double %6, 2.000000e+00
+  %8 = tail call double @llvm.fmuladd.f64(double %7, double %7, double 0.000000e+00)
+  %9 = tail call double @llvm.fmuladd.f64(double %7, double %7, double %8)
+  %sqrt = tail call double @llvm.sqrt.f64(double %9)
+  %10 = fadd double %sqrt1, %sqrt
+  store double %10, ptr %0, align 8
+  ret void
+}
+
+
+; This function needs the element-load to be recognized in SystemZ
+; getVectorInstrCost().
+define void @fun1(double %0) local_unnamed_addr {
+; CHECK-LABEL: define void @fun1(
+; CHECK: fsub <2 x double>
+; CHECK: fsub <2 x double>
+; CHECK: fsub <2 x double>
+; CHECK: fmul <2 x double>
+; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
+; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
+; CHECK: %14 = fcmp olt <2 x double> %13, %2
+
+  br label %2
+
+2:
+  %3 = phi double [ poison, %1 ], [ poison, %2 ]
+  %4 = phi double [ undef, %1 ], [ poison, %2 ]
+  %5 = phi double [ 0.000000e+00, %1 ], [ poison, %2 ]
+  %6 = phi double [ 0.000000e+00, %1 ], [ poison, %2 ]
+  %7 = phi double [ 0.000000e+00, %1 ], [ poison, %2 ]
+  %8 = phi double [ 0.000000e+00, %1 ], [ %21, %2 ]
+  %9 = fsub double 0.000000e+00, %8
+  %10 = fsub double 0.000000e+00, %7
+  %11 = fmul double %9, 0.000000e+00
+  %12 = fmul double %10, 0.000000e+00
+  %13 = fsub double 0.000000e+00, %6
+  %14 = fsub double 0.000000e+00, %5
+  %15 = tail call double @llvm.fmuladd.f64(double %13, double %13, double %11)
+  %16 = tail call double @llvm.fmuladd.f64(double %14, double %14, double %12)
+  %17 = fsub double 0.000000e+00, %4
+  %18 = fsub double 0.000000e+00, %3
+  %19 = tail call double @llvm.fmuladd.f64(double %17, double %17, double %15)
+  %20 = tail call double @llvm.fmuladd.f64(double %18, double %18, double %16)
+  %21 = load double, ptr null, align 8
+  %22 = fcmp olt double %19, %0
+  %23 = fcmp olt double %20, 0.000000e+00
+  %24 = or i1 %23, %22
+  br label %2
+}
+
+declare double @llvm.fmuladd.f64(double, double, double)
+
+; This should *not* be vectorized as the insertion into the vector isn't free,
+; which is recognized in SystemZTTIImpl::getScalarizationOverhead().
+define void @fun2(ptr %0, ptr %Dst) {
+; CHECK-LABEL: define void @fun2(
+; CHECK-NOT: store <2 x i64>
+  %3 = load i64, ptr %0, align 8
+  %4 = icmp eq i64 %3, 0
+  br i1 %4, label %5, label %6
+
+5:
+  ret void
+
+6:
+  %7 = getelementptr i8, ptr %Dst, i64 24
+  store i64 %3, ptr %7, align 8
+  %8 = getelementptr i8, ptr %Dst, i64 16
+  store i64 0, ptr %8, align 8
+  br label %5
+}
