[SystemZ] SLP reductions: cost functions of reductions and scalarization #112491
Conversation
@llvm/pr-subscribers-backend-risc-v @llvm/pr-subscribers-backend-systemz

Author: Jonas Paulsson (JonPsson1)

Changes: This patch has two separate commits that are kind of independent, even though the second one benefits from the first one:

- Improve the SLPVectorizer getGatherCost() to take into account free vector element loads, as well as related SZTTI cost functions.
- Implement SystemZ getArithmeticReductionCost() and getMinMaxReductionCost(). So far only for FP.
The (preliminary) results on benchmarks are not very significant, but even so this should in theory be a general improvement resulting in slightly increased vectorization (SLP). Integer reduction costs should also be addressed, and probably some more benchmarking should be done before committing.

@dominik-steenken (not sure why, but I could not add you as a reviewer)

Patch is 75.14 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/112491.diff

8 Files Affected:
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 7e5728c40950ad..9ab5a77280e6f9 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -469,6 +469,33 @@ bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
}
+InstructionCost SystemZTTIImpl::
+getScalarizationOverhead(VectorType *Ty,
+ const APInt &DemandedElts,
+ bool Insert, bool Extract,
+ TTI::TargetCostKind CostKind) {
+ unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
+ InstructionCost Cost = 0;
+
+ if (Insert && Ty->isIntOrIntVectorTy(64)) {
+ // VLVGP will insert two GPRs with one instruction.
+ InstructionCost CurrVectorCost = 0;
+ for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
+ if (DemandedElts[Idx])
+ ++CurrVectorCost;
+ if (Idx % 2 == 1) {
+ Cost += std::min(InstructionCost(1), CurrVectorCost);
+ CurrVectorCost = 0;
+ }
+ }
+ Insert = false;
+ }
+
+ Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
+ Extract, CostKind);
+ return Cost;
+}
+
// Return the bit size for the scalar type or vector element
// type. getScalarSizeInBits() returns 0 for a pointer type.
static unsigned getScalarSizeInBits(Type *Ty) {
@@ -610,7 +637,7 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
if (DivRemConst) {
SmallVector<Type *> Tys(Args.size(), Ty);
return VF * DivMulSeqCost +
- getScalarizationOverhead(VTy, Args, Tys, CostKind);
+ BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
}
if ((SignedDivRem || UnsignedDivRem) && VF > 4)
// Temporary hack: disable high vectorization factors with integer
@@ -637,7 +664,7 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
SmallVector<Type *> Tys(Args.size(), Ty);
InstructionCost Cost =
(VF * ScalarCost) +
- getScalarizationOverhead(VTy, Args, Tys, CostKind);
+ BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
// FIXME: VF 2 for these FP operations are currently just as
// expensive as for VF 4.
if (VF == 2)
@@ -655,8 +682,9 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
// There is no native support for FRem.
if (Opcode == Instruction::FRem) {
SmallVector<Type *> Tys(Args.size(), Ty);
- InstructionCost Cost = (VF * LIBCALL_COST) +
- getScalarizationOverhead(VTy, Args, Tys, CostKind);
+ InstructionCost Cost =
+ (VF * LIBCALL_COST) +
+ BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
// FIXME: VF 2 for float is currently just as expensive as for VF 4.
if (VF == 2 && ScalarBits == 32)
Cost *= 2;
@@ -976,10 +1004,10 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
(Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
NeedsExtracts = false;
- TotCost += getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
- NeedsExtracts, CostKind);
- TotCost += getScalarizationOverhead(DstVecTy, NeedsInserts,
- /*Extract*/ false, CostKind);
+ TotCost += BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
+ NeedsExtracts, CostKind);
+ TotCost += BaseT::getScalarizationOverhead(DstVecTy, NeedsInserts,
+ /*Extract*/ false, CostKind);
// FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
@@ -991,8 +1019,8 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
if (Opcode == Instruction::FPTrunc) {
if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements.
return VF /*ldxbr/lexbr*/ +
- getScalarizationOverhead(DstVecTy, /*Insert*/ true,
- /*Extract*/ false, CostKind);
+ BaseT::getScalarizationOverhead(DstVecTy, /*Insert*/ true,
+ /*Extract*/ false, CostKind);
else // double -> float
return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
}
@@ -1005,8 +1033,8 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
return VF * 2;
}
// -> fp128. VF * lxdb/lxeb + extraction of elements.
- return VF + getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
- /*Extract*/ true, CostKind);
+ return VF + BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
+ /*Extract*/ true, CostKind);
}
}
@@ -1115,10 +1143,17 @@ InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index, Value *Op0,
Value *Op1) {
- // vlvgp will insert two grs into a vector register, so only count half the
- // number of instructions.
- if (Opcode == Instruction::InsertElement && Val->isIntOrIntVectorTy(64))
- return ((Index % 2 == 0) ? 1 : 0);
+ if (Opcode == Instruction::InsertElement) {
+ // Vector Element Load.
+ if (Op1 != nullptr && Op1->hasOneUse() && isa<LoadInst>(Op1))
+ return 0;
+
+ // vlvgp will insert two grs into a vector register, so count half the
+ // number of instructions as an estimate when we don't have the full
+ // picture (as in getScalarizationOverhead()).
+ if (Val->isIntOrIntVectorTy(64))
+ return ((Index % 2 == 0) ? 1 : 0);
+ }
if (Opcode == Instruction::ExtractElement) {
int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /*+test-under-mask*/ : 1);
@@ -1353,6 +1388,57 @@ InstructionCost SystemZTTIImpl::getInterleavedMemoryOpCost(
return NumVectorMemOps + NumPermutes;
}
+// EXPERIMENTAL
+static cl::opt<unsigned> REDLIM("redlim", cl::init(0));
+
+InstructionCost getFPReductionCost(unsigned NumVec, unsigned ScalarBits) {
+ unsigned NumEltsPerVecReg = (SystemZ::VectorBits / ScalarBits);
+ InstructionCost Cost = 0;
+ Cost += NumVec - 1; // Full vector operations.
+ Cost += NumEltsPerVecReg; // Last vector scalar operations.
+ return Cost;
+}
+
+InstructionCost
+SystemZTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
+ std::optional<FastMathFlags> FMF,
+ TTI::TargetCostKind CostKind) {
+ if (!TTI::requiresOrderedReduction(FMF) && ST->hasVector() &&
+ (Opcode == Instruction::FAdd || Opcode == Instruction::FMul)) {
+ unsigned NumVectors = getNumVectorRegs(Ty);
+ unsigned ScalarBits = Ty->getScalarSizeInBits();
+
+ // // EXPERIMENTAL: better to not vectorize small vectors?:
+ // unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
+ // if (NumElts <= REDLIM)
+ // return NumVectors * 8; // => MachineCombiner
+
+ // // EXPERIMENTAL: Return a low cost to enable heavily.
+ // return NumVectors / 2;
+
+ return getFPReductionCost(NumVectors, ScalarBits);
+ }
+
+ return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
+}
+
+InstructionCost
+SystemZTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
+ FastMathFlags FMF,
+ TTI::TargetCostKind CostKind) {
+ if (Ty->isFPOrFPVectorTy() && ST->hasVectorEnhancements1()) {
+ unsigned NumVectors = getNumVectorRegs(Ty);
+ unsigned ScalarBits = Ty->getScalarSizeInBits();
+
+ // // EXPERIMENTAL: Return a low cost to enable heavily.
+ // return NumVectors / 2;
+
+ return getFPReductionCost(NumVectors, ScalarBits);
+ }
+
+ return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
+}
+
static int
getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
const SmallVectorImpl<Type *> &ParamTys) {
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index 8cc71a6c528f82..b65e75ab98814c 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -81,6 +81,10 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
bool hasDivRemOp(Type *DataType, bool IsSigned);
bool prefersVectorizedAddressing() { return false; }
bool LSRWithInstrQueries() { return true; }
+ InstructionCost getScalarizationOverhead(VectorType *Ty,
+ const APInt &DemandedElts,
+ bool Insert, bool Extract,
+ TTI::TargetCostKind CostKind);
bool supportsEfficientVectorElementLoadStore() { return true; }
bool enableInterleavedAccessVectorization() { return true; }
@@ -125,6 +129,13 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
bool UseMaskForCond = false, bool UseMaskForGaps = false);
+ InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
+ std::optional<FastMathFlags> FMF,
+ TTI::TargetCostKind CostKind);
+ InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
+ FastMathFlags FMF,
+ TTI::TargetCostKind CostKind);
+
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind);
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 595aed2cab182b..a4fedb34785e16 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3025,8 +3025,8 @@ class BoUpSLP {
unsigned NumParts, bool ForOrder = false);
/// \returns the scalarization cost for this list of values. Assuming that
- /// this subtree gets vectorized, we may need to extract the values from the
- /// roots. This method calculates the cost of extracting the values.
+ /// this subtree gets vectorized, we may need to insert the values from the
+ /// roots. This method calculates the cost of inserting the values.
/// \param ForPoisonSrc true if initial vector is poison, false otherwise.
InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
Type *ScalarTy) const;
@@ -12897,7 +12897,15 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
TTI::SK_InsertSubvector, VecTy, std::nullopt, CostKind,
I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
} else {
- Cost = TTI->getScalarizationOverhead(VecTy, ~ShuffledElements,
+ // Add insertion costs for all elements, but not for loads that can be
+ // loaded directly into a vector element for free.
+ APInt FreeEltLoads = APInt::getZero(VL.size());
+ if (TTI->supportsEfficientVectorElementLoadStore())
+ for (unsigned I = 0, E = VL.size(); I < E; ++I)
+ if (VL[I]->hasOneUse() && isa<LoadInst>(VL[I]))
+ FreeEltLoads.setBit(I);
+ APInt DemandedElts = ~ShuffledElements & ~FreeEltLoads;
+ Cost = TTI->getScalarizationOverhead(VecTy, DemandedElts,
/*Insert*/ true,
/*Extract*/ false, CostKind);
}
diff --git a/llvm/test/Analysis/CostModel/SystemZ/vector-reductions-fp.ll b/llvm/test/Analysis/CostModel/SystemZ/vector-reductions-fp.ll
new file mode 100644
index 00000000000000..055c25298d847e
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/SystemZ/vector-reductions-fp.ll
@@ -0,0 +1,131 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt -passes='print<cost-model>' -disable-output -mtriple=s390x-unknown-linux \
+; RUN: -mcpu=z15 < %s 2>&1 | FileCheck %s --check-prefix=Z15
+
+define void @fadd_reductions() {
+; Z15-LABEL: 'fadd_reductions'
+; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef)
+ %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef)
+ %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef)
+ %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef)
+ %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
+ ret void
+}
+
+define void @fast_fadd_reductions() {
+; Z15-LABEL: 'fast_fadd_reductions'
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef)
+ %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef)
+ %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef)
+ %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef)
+ %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
+
+ ret void
+}
+
+define void @fmul_reductions() {
+; Z15-LABEL: 'fmul_reductions'
+; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f32 = call float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fmul_v8f32 = call float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmul_v2f64 = call double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f64 = call double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f128 = call fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %fmul_v4f32 = call float @llvm.vector.reduce.fmul.v4f32(float 0.0, <4 x float> undef)
+ %fmul_v8f32 = call float @llvm.vector.reduce.fmul.v8f32(float 0.0, <8 x float> undef)
+ %fmul_v2f64 = call double @llvm.vector.reduce.fmul.v2f64(double 0.0, <2 x double> undef)
+ %fmul_v4f64 = call double @llvm.vector.reduce.fmul.v4f64(double 0.0, <4 x double> undef)
+ %fmul_v4f128 = call fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
+ ret void
+}
+
+define void @fast_fmul_reductions() {
+; Z15-LABEL: 'fast_fmul_reductions'
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmul_v4f32 = call fast float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fmul_v8f32 = call fast float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmul_v2f64 = call fast double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fmul_v4f64 = call fast double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %fmul_v4f32 = call fast float @llvm.vector.reduce.fmul.v4f32(float 0.0, <4 x float> undef)
+ %fmul_v8f32 = call fast float @llvm.vector.reduce.fmul.v8f32(float 0.0, <8 x float> undef)
+ %fmul_v2f64 = call fast double @llvm.vector.reduce.fmul.v2f64(double 0.0, <2 x double> undef)
+ %fmul_v4f64 = call fast double @llvm.vector.reduce.fmul.v4f64(double 0.0, <4 x double> undef)
+ %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
+
+ ret void
+}
+
+define void @fmin_reductions() {
+; Z15-LABEL: 'fmin_reductions'
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %V4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
+ %V8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
+ %V2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
+ %V4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
+ %V4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef)
+ ret void
+}
+
+define void @fmax_reductions() {
+; Z15-LABEL: 'fmax_reductions'
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef...
[truncated]
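As a cross-check, the costs in the new vector-reductions-fp.ll test above follow directly from getFPReductionCost() in the patch. A small standalone sketch (hypothetical harness; assumes SystemZ::VectorBits, the vector register width, is 128 bits):

  #include <cassert>

  // Mirrors getFPReductionCost() from the patch: NumVec - 1 full-width
  // vector operations to combine the vector registers, plus one scalar
  // step per element of the last register.
  static unsigned getFPReductionCost(unsigned NumVec, unsigned ScalarBits) {
    const unsigned VectorBits = 128; // assumed SystemZ::VectorBits
    unsigned NumEltsPerVecReg = VectorBits / ScalarBits;
    return (NumVec - 1) + NumEltsPerVecReg;
  }

  int main() {
    assert(getFPReductionCost(1, 32) == 4);  // fast fadd v4f32
    assert(getFPReductionCost(2, 32) == 5);  // fast fadd v8f32
    assert(getFPReductionCost(1, 64) == 2);  // fast fadd v2f64
    assert(getFPReductionCost(2, 64) == 3);  // fast fadd v4f64
    assert(getFPReductionCost(4, 128) == 4); // fast fadd v4f128
    return 0;
  }

These values match the Z15 check lines in @fast_fadd_reductions, @fast_fmul_reductions and the fmin/fmax tests.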
✅ With the latest revision this PR passed the C/C++ code formatter.
    // loaded directly into a vector element for free.
    APInt FreeEltLoads = APInt::getZero(VL.size());
    if (TTI->supportsEfficientVectorElementLoadStore())
      for (unsigned I = 0, E = VL.size(); I < E; ++I)
Suggested change:
-      for (unsigned I = 0, E = VL.size(); I < E; ++I)
+      for (unsigned I : seq<unsigned>(VL.size()))
fixed - thanks. Does this change to SLPVectorizer.cpp now look ok?
(Force-pushed 532984d to 3c61251.)
To simplify the review process I have reverted the SystemZ FP reduction costs part, so that only the insertion-cost improvements are dealt with first (revert 9be511d to get them back).
SystemZ cost function changes LGTM.
@@ -0,0 +1,88 @@
; RUN: opt -S --passes=slp-vectorizer -mtriple=s390x-unknown-linux -mcpu=z16 < %s | FileCheck %s |
Better to precommit the test with the current codegen. Also, it would be good to add a cost-estimation check to show what is modified.
There is no difference in the cost estimations of the functions in this file - one by one they return the same scalar cost. The interesting thing here is to run SLP and see how it behaves with the improved scalarization costs. So I don't think that running opt with -passes='print<cost-model>' makes much sense. Looking at the SLP output doesn't give me much either; there is just a different number for a particular bundle, but it wouldn't be clear in the test what that means, I think.
This could be precommitted, but I think I would prefer to wait until it is actually ready to be committed.
- The test itself is better to precommit in a separate NFC patch anyway.
- There should be a difference, otherwise it is meaningless. No need to check other passes; the SLP vectorizer has its own cost estimation, which can be obtained via the -pass-remarks-output= option. There are several tests already which use it.
ok - there is indeed a difference with -pass-remarks-output, which is quite readable. Test updated.
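(For reference, this is roughly the shape such a test takes - a sketch only; the exact RUN lines and check prefixes in the committed test may differ:)

  ; RUN: opt -S --passes=slp-vectorizer -mtriple=s390x-unknown-linux -mcpu=z16 \
  ; RUN:   -pass-remarks-output=%t < %s | FileCheck %s
  ; RUN: cat %t | FileCheck --check-prefix=REMARK %s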
(Outdated review thread on llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll, marked resolved.)
  /// roots. This method calculates the cost of inserting the values.
  /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
  /// \returns the cost of gathering (inserting) the values in \p VL into a
  /// vector. \param ForPoisonSrc true if initial vector is poison, false
Try to reformat to make \param go on a separate line.
Fixed.
(Force-pushed b5f12c0 to 245e769.)
Patch updated per review.
@uweigand: should we precommit this test now, or do we wait until we are ready to actually commit this patch? I can't remember us doing this type of pre-committing of tests in the past; rather, we wait for performance results and then simply add a test that reflects the new behavior...
Fine with me to precommit the test, so that we see the difference this patch makes. I'll leave it up to you whether to do this right away or later.
(Force-pushed 245e769 to 33c9f58.)
Test is now precommitted, with patch rebased.
; CHECK: fmul <2 x double>
; CHECK-NEXT: call <2 x double> @llvm.fmuladd.v2f64(
; CHECK-NEXT: call <2 x double> @llvm.fmuladd.v2f64(
; CHECK-NEXT: call <2 x double> @llvm.sqrt.v2f64(
Could you auto-generate the checks here using the script? The checks for the costs will still be manual, but the IR checks are better auto-generated.
sure
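(That is, something along these lines - a sketch assuming an in-tree build directory named build; the script lives under llvm/utils:)

  $ llvm/utils/update_test_checks.py --opt-binary=build/bin/opt \
      llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll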
; REMARK: Args:
; REMARK-NEXT: - String: 'Stores SLP vectorized with cost '
; REMARK-NEXT: - Cost: '-1'
; REMARK-NOT: Function: fun2
Better to copy the actual cost here rather than use the -NOT syntax.
I would, but the problem is that SLPVectorizer prints nothing for fun2 - I guess because it doesn't see any opportunity. This seems to be the case also in other tests, like AArch64/gather-cost.ll.
LG
(Force-pushed 0a73684 to a9eddd1.)
A slight regression (mcf) has gotten in the way of this patch being committed. It seems that the problem is that when SLP thinks element loads are free, it creates VLREP; VLEG; VST (two element loads and a vector store) out of two separate loads and stores. However, the SystemZ backend prefers MVC (memcopy) even of 8 bytes, so the result is that this sequence replaces two MVCs, not four instructions.

Not sure how to best avoid this:
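Schematically, for a reduced two-i64 copy (hypothetical assembly; registers and offsets are illustrative, not taken from the actual mcf regression):

  # Preferred codegen today: two 8-byte memory-to-memory copies.
  mvc    0(8,%r3),0(%r2)
  mvc    8(8,%r3),8(%r2)

  # With free element loads, SLP instead produces the sequence
  # described above:
  vlrepg %v0,0(%r2)        # element load of the first i64
  vleg   %v0,8(%r2),1      # element load of the second i64
  vst    %v0,0(%r3)        # one vector store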
Prefer option 3.
(Force-pushed a9eddd1 to ba3eb68.)
ok - I tried it:

- TargetTransformInfo: add an optional argument to getScalarizationOverhead() to pass the actual Values.
- BasicTTIImpl: pass the actual Value to getVectorInstrCost() if present.
- SLPVectorizer: pass VL to getScalarizationOverhead() instead of changing DemandedElts with supportsEfficientVectorElementLoadStore().
- SystemZ: new function isFreeEltLoad() that also checks for the load->store case.

@uweigand: In SystemZISelLowering.cpp we have MaxStoresPerMemcpy set to two. But I suppose that's a slightly different case where we use just a single VL per VST. isFreeEltLoad() now avoids all load->store cost discounts, even though on mcf it was i64 only. Does this make sense, or could it be that it is better to gather many loads into a single store?

mcf is now unaffected by this patch entirely. Considerably fewer files are changed (~400 -> ~200) over benchmarks.
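Pieced together from the hunks quoted in the threads below, the resulting shape is roughly the following (a sketch, not the committed code; in particular the body of isFreeEltLoad() is not quoted in this PR excerpt, so it is inferred from the description above):

  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // SystemZ predicate (ASSUMED body, inferred from the description): a
  // scalar is a free element load if it is a single-use load, except when
  // its only user is a store (the load->store case), where the backend
  // prefers MVC over building a vector register.
  static bool isFreeEltLoad(Value *Op) {
    if (auto *Ld = dyn_cast_or_null<LoadInst>(Op))
      return Ld->hasOneUse() && !isa<StoreInst>(*Ld->user_begin());
    return false;
  }

  // SLPVectorizer side (per the quoted hunks): pass the scalars themselves
  // so the target can discount free element loads, instead of pre-masking
  // DemandedElts:
  //   Cost = TTI->getScalarizationOverhead(VecTy, ~ShuffledElements,
  //                                        /*Insert=*/true,
  //                                        /*Extract=*/false, CostKind, VL);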
  /// Insert is true.
  InstructionCost getScalarizationOverhead(
      VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
      TTI::TargetCostKind CostKind, ArrayRef<Value *> VL = std::nullopt) const;
Suggested change:
-      TTI::TargetCostKind CostKind, ArrayRef<Value *> VL = std::nullopt) const;
+      TTI::TargetCostKind CostKind, ArrayRef<Value *> VL = {}) const;
IIRC, std::nullopt is going to be deprecated/dleted for ArrayRef initialization
fixed
      VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract,
      TTI::TargetCostKind CostKind, ArrayRef<Value *> VL = std::nullopt) {
    assert((VL.empty() ||
            VL.size() == cast<FixedVectorType>(InTy)->getNumElements()) &&
Also, the size of DemandedElts should match VL.size() if VL is not empty.
merged it with the other assertion that was already there...
(Force-pushed ba3eb68 to afe3cdf.)
Wait with this and first evaluate the scalarization costs separately.
(Force-pushed afe3cdf to d8269a3.)
@alexey-bataev Sorry for the delay on this, I have fixed your latest concerns and updated the patch. Looks ok to you now?
"Vector size mismatch"); | ||
|
||
InstructionCost Cost = 0; | ||
|
||
for (int i = 0, e = Ty->getNumElements(); i < e; ++i) { | ||
if (!DemandedElts[i]) | ||
continue; | ||
if (Insert) | ||
if (Insert) { | ||
Value *InsertedVal = VL.size() ? VL[i] : nullptr; |
Suggested change:
-        Value *InsertedVal = VL.size() ? VL[i] : nullptr;
+        Value *InsertedVal = VL.empty() ? nullptr : VL[i];
ok, I wonder if this is part of the coding guidelines somewhere? It should be equivalent, right?
Yes, it is equivalent; it just makes the logic easier to understand.
ok, fine with me...
Co-authored-by: Alexey Bataev <[email protected]>
Thanks for the review.
This patch has two separate commits that are kind of independent, even though the second one benefits from the first one:

- Improve the SLPVectorizer getGatherCost() to take into account free vector element loads, as well as related SZTTI cost functions (a small IR sketch of this pattern follows the list).
- Implement SystemZ getArithmeticReductionCost() and getMinMaxReductionCost(). So far only for FP.
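A minimal, hypothetical IR example of the free-element-load pattern (not taken from the PR's tests; function and value names are made up):

  define <2 x double> @gather_two_loads(ptr %p, ptr %q) {
    ; Each load has exactly one use (its insertelement), so the insertions
    ; are now costed as free: SystemZ can build the vector with VLREP/VLEG.
    %a = load double, ptr %p
    %b = load double, ptr %q
    %v0 = insertelement <2 x double> poison, double %a, i32 0
    %v1 = insertelement <2 x double> %v0, double %b, i32 1
    ret <2 x double> %v1
  }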
The (preliminary) results on benchmarks are not very significant, but even so this should in theory be a general improvement resulting in slightly increased vectorization (SLP). Integer reduction costs should also be addressed, and probably some more benchmarking should be done before committing.
@dominik-steenken (not sure why, but I could not add you as a reviewer)
Notes: