Commit b75e1b5

Improvements to vector element insertion costs.
1 parent c76045d

File tree: 4 files changed (+154, -19 lines)

llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp

Lines changed: 51 additions & 16 deletions
@@ -469,6 +469,33 @@ bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
   return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
 }
 
+InstructionCost SystemZTTIImpl::
+getScalarizationOverhead(VectorType *Ty,
+                         const APInt &DemandedElts,
+                         bool Insert, bool Extract,
+                         TTI::TargetCostKind CostKind) {
+  unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
+  InstructionCost Cost = 0;
+
+  if (Insert && Ty->isIntOrIntVectorTy(64)) {
+    // VLVGP will insert two GPRs with one instruction.
+    InstructionCost CurrVectorCost = 0;
+    for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
+      if (DemandedElts[Idx])
+        ++CurrVectorCost;
+      if (Idx % 2 == 1) {
+        Cost += std::min(InstructionCost(1), CurrVectorCost);
+        CurrVectorCost = 0;
+      }
+    }
+    Insert = false;
+  }
+
+  Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
+                                          Extract, CostKind);
+  return Cost;
+}
+
 // Return the bit size for the scalar type or vector element
 // type. getScalarSizeInBits() returns 0 for a pointer type.
 static unsigned getScalarSizeInBits(Type *Ty) {
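
For illustration only, here is a minimal standalone model of the pairing logic introduced above, in plain C++ (std::vector<bool> and unsigned stand in for APInt and InstructionCost, and pairedInsertCost is a made-up name, not part of the patch):

#include <algorithm>
#include <cstdio>
#include <vector>

// Each VLVGP fills one adjacent pair of i64 lanes from two GPRs, so a pair
// of lanes costs at most one instruction no matter how many of its two
// elements are actually demanded.
static unsigned pairedInsertCost(const std::vector<bool> &DemandedElts) {
  unsigned Cost = 0, CurrVectorCost = 0;
  for (unsigned Idx = 0; Idx < DemandedElts.size(); ++Idx) {
    if (DemandedElts[Idx])
      ++CurrVectorCost;
    if (Idx % 2 == 1) { // close the current even/odd lane pair
      Cost += std::min(1u, CurrVectorCost);
      CurrVectorCost = 0;
    }
  }
  return Cost;
}

int main() {
  // Lanes 0, 2 and 3 of a <4 x i64> demanded: pairs (0,1) and (2,3) each
  // take one VLVGP, so the modeled insertion cost is 2 rather than 3.
  std::printf("%u\n", pairedInsertCost({true, false, true, true}));
  return 0;
}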
@@ -610,7 +637,7 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
   if (DivRemConst) {
     SmallVector<Type *> Tys(Args.size(), Ty);
     return VF * DivMulSeqCost +
-           getScalarizationOverhead(VTy, Args, Tys, CostKind);
+           BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
   }
   if ((SignedDivRem || UnsignedDivRem) && VF > 4)
     // Temporary hack: disable high vectorization factors with integer
@@ -637,7 +664,7 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
     SmallVector<Type *> Tys(Args.size(), Ty);
     InstructionCost Cost =
         (VF * ScalarCost) +
-        getScalarizationOverhead(VTy, Args, Tys, CostKind);
+        BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
     // FIXME: VF 2 for these FP operations are currently just as
     // expensive as for VF 4.
     if (VF == 2)
@@ -655,8 +682,9 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
   // There is no native support for FRem.
   if (Opcode == Instruction::FRem) {
     SmallVector<Type *> Tys(Args.size(), Ty);
-    InstructionCost Cost = (VF * LIBCALL_COST) +
-                           getScalarizationOverhead(VTy, Args, Tys, CostKind);
+    InstructionCost Cost =
+        (VF * LIBCALL_COST) +
+        BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
     // FIXME: VF 2 for float is currently just as expensive as for VF 4.
     if (VF == 2 && ScalarBits == 32)
       Cost *= 2;
@@ -976,10 +1004,10 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
         (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
       NeedsExtracts = false;
 
-    TotCost += getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
-                                        NeedsExtracts, CostKind);
-    TotCost += getScalarizationOverhead(DstVecTy, NeedsInserts,
-                                        /*Extract*/ false, CostKind);
+    TotCost += BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
+                                               NeedsExtracts, CostKind);
+    TotCost += BaseT::getScalarizationOverhead(DstVecTy, NeedsInserts,
+                                               /*Extract*/ false, CostKind);
 
     // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
     if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
@@ -991,8 +1019,8 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
     if (Opcode == Instruction::FPTrunc) {
       if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements.
         return VF /*ldxbr/lexbr*/ +
-               getScalarizationOverhead(DstVecTy, /*Insert*/ true,
-                                        /*Extract*/ false, CostKind);
+               BaseT::getScalarizationOverhead(DstVecTy, /*Insert*/ true,
+                                               /*Extract*/ false, CostKind);
       else // double -> float
         return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
     }
@@ -1005,8 +1033,8 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
         return VF * 2;
       }
       // -> fp128. VF * lxdb/lxeb + extraction of elements.
-      return VF + getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
-                                           /*Extract*/ true, CostKind);
+      return VF + BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
+                                                  /*Extract*/ true, CostKind);
     }
   }

@@ -1115,10 +1143,17 @@ InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                TTI::TargetCostKind CostKind,
                                                unsigned Index, Value *Op0,
                                                Value *Op1) {
-  // vlvgp will insert two grs into a vector register, so only count half the
-  // number of instructions.
-  if (Opcode == Instruction::InsertElement && Val->isIntOrIntVectorTy(64))
-    return ((Index % 2 == 0) ? 1 : 0);
+  if (Opcode == Instruction::InsertElement) {
+    // Vector Element Load.
+    if (Op1 != nullptr && Op1->hasOneUse() && isa<LoadInst>(Op1))
+      return 0;
+
+    // vlvgp will insert two grs into a vector register, so count half the
+    // number of instructions as an estimate when we don't have the full
+    // picture (as in getScalarizationOverhead()).
+    if (Val->isIntOrIntVectorTy(64))
+      return ((Index % 2 == 0) ? 1 : 0);
+  }
 
   if (Opcode == Instruction::ExtractElement) {
     int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /*+test-under-mask*/ : 1);
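
A condensed standalone model of the new InsertElement handling, again in plain C++; the InsertQuery struct, the modeledInsertCost name, and the fall-through cost of 1 are assumptions made for this sketch, since the real hook continues with further cases not shown here:

#include <cstdio>

// Inputs the real hook derives from the IR operands.
struct InsertQuery {
  bool OperandIsSingleUseLoad; // Op1 is a LoadInst with a single use
  bool LaneIs64BitInteger;     // Val->isIntOrIntVectorTy(64)
  unsigned Index;              // lane being written
};

static unsigned modeledInsertCost(const InsertQuery &Q) {
  // A single-use loaded value can be loaded straight into the vector lane
  // (vector element load), so the insert itself is free.
  if (Q.OperandIsSingleUseLoad)
    return 0;
  // i64 lanes are filled two at a time by VLVGP; charge the even lane of
  // each pair and treat the odd lane as free.
  if (Q.LaneIs64BitInteger)
    return (Q.Index % 2 == 0) ? 1 : 0;
  return 1; // assumed default for this sketch: one insert instruction
}

int main() {
  std::printf("%u\n", modeledInsertCost({true, false, 3}));  // 0: folded load
  std::printf("%u\n", modeledInsertCost({false, true, 0}));  // 1: even i64 lane
  std::printf("%u\n", modeledInsertCost({false, true, 1}));  // 0: odd i64 lane
  return 0;
}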

llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h

Lines changed: 4 additions & 0 deletions
@@ -81,6 +81,10 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
   bool hasDivRemOp(Type *DataType, bool IsSigned);
   bool prefersVectorizedAddressing() { return false; }
   bool LSRWithInstrQueries() { return true; }
+  InstructionCost getScalarizationOverhead(VectorType *Ty,
+                                           const APInt &DemandedElts,
+                                           bool Insert, bool Extract,
+                                           TTI::TargetCostKind CostKind);
   bool supportsEfficientVectorElementLoadStore() { return true; }
   bool enableInterleavedAccessVectorization() { return true; }

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 11 additions & 3 deletions
@@ -3025,8 +3025,8 @@ class BoUpSLP {
                            unsigned NumParts, bool ForOrder = false);
 
   /// \returns the scalarization cost for this list of values. Assuming that
-  /// this subtree gets vectorized, we may need to extract the values from the
-  /// roots. This method calculates the cost of extracting the values.
+  /// this subtree gets vectorized, we may need to insert the values from the
+  /// roots. This method calculates the cost of inserting the values.
   /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
   InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
                                 Type *ScalarTy) const;
@@ -12897,7 +12897,15 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
           TTI::SK_InsertSubvector, VecTy, std::nullopt, CostKind,
           I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
     } else {
-      Cost = TTI->getScalarizationOverhead(VecTy, ~ShuffledElements,
+      // Add insertion costs for all elements, but not for loads that can be
+      // loaded directly into a vector element for free.
+      APInt FreeEltLoads = APInt::getZero(VL.size());
+      if (TTI->supportsEfficientVectorElementLoadStore())
+        for (unsigned I = 0, E = VL.size(); I < E; ++I)
+          if (VL[I]->hasOneUse() && isa<LoadInst>(VL[I]))
+            FreeEltLoads.setBit(I);
+      APInt DemandedElts = ~ShuffledElements & ~FreeEltLoads;
+      Cost = TTI->getScalarizationOverhead(VecTy, DemandedElts,
                                            /*Insert*/ true,
                                            /*Extract*/ false, CostKind);
     }
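
The effect of the new mask can be illustrated with a small standalone snippet (plain C++, std::bitset standing in for APInt; the element assignments are invented for the example):

#include <bitset>
#include <cstdio>

int main() {
  constexpr unsigned VF = 4;
  // Gather of 4 scalars: element 1 is already covered by a shuffle, and
  // element 3 is a single-use load the target can insert for free.
  std::bitset<VF> ShuffledElements("0010");
  std::bitset<VF> FreeEltLoads("1000");
  // Only elements that are neither shuffled in nor free element loads get
  // charged an insertion cost.
  std::bitset<VF> DemandedElts = ~ShuffledElements & ~FreeEltLoads;
  std::printf("%s\n", DemandedElts.to_string().c_str()); // "0101": lanes 0 and 2
  return 0;
}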
New test file — Lines changed: 88 additions & 0 deletions
@@ -0,0 +1,88 @@
+; RUN: opt -S --passes=slp-vectorizer -mtriple=s390x-unknown-linux -mcpu=z16 < %s | FileCheck %s
+;
+; Test functions that (at least currently) only get vectorized if the
+; insertion cost for an element load is counted as free.
+
+; This function needs the free element load to be recognized in SLP
+; getGatherCost().
+define void @fun0(ptr nocapture %0, double %1) {
+; CHECK-LABEL: define void @fun0(
+; CHECK: fmul <2 x double>
+; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
+
+  %3 = fmul double %1, 2.000000e+00
+  %4 = tail call double @llvm.fmuladd.f64(double %3, double %3, double 0.000000e+00)
+  %5 = tail call double @llvm.fmuladd.f64(double %3, double %3, double %4)
+  %sqrt1 = tail call double @llvm.sqrt.f64(double %5)
+  %6 = load double, ptr %0, align 8
+  %7 = fmul double %6, 2.000000e+00
+  %8 = tail call double @llvm.fmuladd.f64(double %7, double %7, double 0.000000e+00)
+  %9 = tail call double @llvm.fmuladd.f64(double %7, double %7, double %8)
+  %sqrt = tail call double @llvm.sqrt.f64(double %9)
+  %10 = fadd double %sqrt1, %sqrt
+  store double %10, ptr %0, align 8
+  ret void
+}
+
+
+; This function needs the element-load to be recognized in SystemZ
+; getVectorInstrCost().
+define void @fun1(double %0) local_unnamed_addr {
+; CHECK-LABEL: define void @fun1(
+; CHECK: fsub <2 x double>
+; CHECK: fsub <2 x double>
+; CHECK: fsub <2 x double>
+; CHECK: fmul <2 x double>
+; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
+; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
+; CHECK: %14 = fcmp olt <2 x double> %13, %2
+
+  br label %2
+
+2:
+  %3 = phi double [ poison, %1 ], [ poison, %2 ]
+  %4 = phi double [ undef, %1 ], [ poison, %2 ]
+  %5 = phi double [ 0.000000e+00, %1 ], [ poison, %2 ]
+  %6 = phi double [ 0.000000e+00, %1 ], [ poison, %2 ]
+  %7 = phi double [ 0.000000e+00, %1 ], [ poison, %2 ]
+  %8 = phi double [ 0.000000e+00, %1 ], [ %21, %2 ]
+  %9 = fsub double 0.000000e+00, %8
+  %10 = fsub double 0.000000e+00, %7
+  %11 = fmul double %9, 0.000000e+00
+  %12 = fmul double %10, 0.000000e+00
+  %13 = fsub double 0.000000e+00, %6
+  %14 = fsub double 0.000000e+00, %5
+  %15 = tail call double @llvm.fmuladd.f64(double %13, double %13, double %11)
+  %16 = tail call double @llvm.fmuladd.f64(double %14, double %14, double %12)
+  %17 = fsub double 0.000000e+00, %4
+  %18 = fsub double 0.000000e+00, %3
+  %19 = tail call double @llvm.fmuladd.f64(double %17, double %17, double %15)
+  %20 = tail call double @llvm.fmuladd.f64(double %18, double %18, double %16)
+  %21 = load double, ptr null, align 8
+  %22 = fcmp olt double %19, %0
+  %23 = fcmp olt double %20, 0.000000e+00
+  %24 = or i1 %23, %22
+  br label %2
+}
+
+declare double @llvm.fmuladd.f64(double, double, double)
+
+; This should *not* be vectorized as the insertion into the vector isn't free,
+; which is recognized in SystemZTTIImpl::getScalarizationOverhead().
+define void @fun2(ptr %0, ptr %Dst) {
+; CHECK-LABEL: define void @fun2(
+; CHECK-NOT: store <2 x i64>
+  %3 = load i64, ptr %0, align 8
+  %4 = icmp eq i64 %3, 0
+  br i1 %4, label %5, label %6
+
+5:
+  ret void
+
+6:
+  %7 = getelementptr i8, ptr %Dst, i64 24
+  store i64 %3, ptr %7, align 8
+  %8 = getelementptr i8, ptr %Dst, i64 16
+  store i64 0, ptr %8, align 8
+  br label %5
+}
