Skip to content

Commit 594147f

Browse files
committed
Improvements to vector elements insertion costs.
1 parent aba39c3 commit 594147f

File tree

4 files changed

+79
-72
lines changed

4 files changed

+79
-72
lines changed

llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp

Lines changed: 51 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -469,6 +469,33 @@ bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
469469
return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
470470
}
471471

472+
/// Estimate the cost of inserting and/or extracting the elements of \p Ty
/// selected by \p DemandedElts.
///
/// Inserts into vectors for which Ty->isIntOrIntVectorTy(64) holds are costed
/// here: elements are considered in adjacent pairs (0,1), (2,3), ... and each
/// pair with at least one demanded element costs one instruction, since VLVGP
/// inserts two GPRs at once. All remaining work (extracts, and inserts for any
/// other element type) is delegated to BaseT::getScalarizationOverhead().
///
/// \param Ty           Vector type; must be a FixedVectorType (enforced by
///                     the cast<> below).
/// \param DemandedElts Bit I is set if element I needs to be inserted /
///                     extracted.
/// \param Insert       Include the cost of inserting the demanded elements.
/// \param Extract      Include the cost of extracting the demanded elements.
/// \param CostKind     Forwarded unchanged to the base implementation.
/// \returns the accumulated cost.
InstructionCost SystemZTTIImpl::
getScalarizationOverhead(VectorType *Ty,
                         const APInt &DemandedElts,
                         bool Insert, bool Extract,
                         TTI::TargetCostKind CostKind) {
  unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
  InstructionCost Cost = 0;

  if (Insert && Ty->isIntOrIntVectorTy(64)) {
    // VLVGP will insert two GPRs with one instruction.
    InstructionCost CurrVectorCost = 0;
    for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
      if (DemandedElts[Idx])
        ++CurrVectorCost;
      if (Idx % 2 == 1) {
        // End of a pair: charge at most one instruction for it.
        Cost += std::min(InstructionCost(1), CurrVectorCost);
        CurrVectorCost = 0;
      }
    }
    // With an odd number of elements the last element has no partner and the
    // loop never reaches an odd index for it. Charge it here so a demanded
    // trailing element is not treated as free. For an even element count
    // CurrVectorCost is 0 at this point and nothing is added.
    Cost += std::min(InstructionCost(1), CurrVectorCost);
    // Inserts are fully accounted for above; only let the base implementation
    // add the extract cost (if requested).
    Insert = false;
  }

  Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
                                          Extract, CostKind);
  return Cost;
}
498+
472499
// Return the bit size for the scalar type or vector element
473500
// type. getScalarSizeInBits() returns 0 for a pointer type.
474501
static unsigned getScalarSizeInBits(Type *Ty) {
@@ -610,7 +637,7 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
610637
if (DivRemConst) {
611638
SmallVector<Type *> Tys(Args.size(), Ty);
612639
return VF * DivMulSeqCost +
613-
getScalarizationOverhead(VTy, Args, Tys, CostKind);
640+
BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
614641
}
615642
if ((SignedDivRem || UnsignedDivRem) && VF > 4)
616643
// Temporary hack: disable high vectorization factors with integer
@@ -637,7 +664,7 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
637664
SmallVector<Type *> Tys(Args.size(), Ty);
638665
InstructionCost Cost =
639666
(VF * ScalarCost) +
640-
getScalarizationOverhead(VTy, Args, Tys, CostKind);
667+
BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
641668
// FIXME: VF 2 for these FP operations is currently just as
642669
// expensive as for VF 4.
643670
if (VF == 2)
@@ -655,8 +682,9 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
655682
// There is no native support for FRem.
656683
if (Opcode == Instruction::FRem) {
657684
SmallVector<Type *> Tys(Args.size(), Ty);
658-
InstructionCost Cost = (VF * LIBCALL_COST) +
659-
getScalarizationOverhead(VTy, Args, Tys, CostKind);
685+
InstructionCost Cost =
686+
(VF * LIBCALL_COST) +
687+
BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
660688
// FIXME: VF 2 for float is currently just as expensive as for VF 4.
661689
if (VF == 2 && ScalarBits == 32)
662690
Cost *= 2;
@@ -976,10 +1004,10 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
9761004
(Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
9771005
NeedsExtracts = false;
9781006

979-
TotCost += getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
980-
NeedsExtracts, CostKind);
981-
TotCost += getScalarizationOverhead(DstVecTy, NeedsInserts,
982-
/*Extract*/ false, CostKind);
1007+
TotCost += BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
1008+
NeedsExtracts, CostKind);
1009+
TotCost += BaseT::getScalarizationOverhead(DstVecTy, NeedsInserts,
1010+
/*Extract*/ false, CostKind);
9831011

9841012
// FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
9851013
if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
@@ -991,8 +1019,8 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
9911019
if (Opcode == Instruction::FPTrunc) {
9921020
if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements.
9931021
return VF /*ldxbr/lexbr*/ +
994-
getScalarizationOverhead(DstVecTy, /*Insert*/ true,
995-
/*Extract*/ false, CostKind);
1022+
BaseT::getScalarizationOverhead(DstVecTy, /*Insert*/ true,
1023+
/*Extract*/ false, CostKind);
9961024
else // double -> float
9971025
return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
9981026
}
@@ -1005,8 +1033,8 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
10051033
return VF * 2;
10061034
}
10071035
// -> fp128. VF * lxdb/lxeb + extraction of elements.
1008-
return VF + getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
1009-
/*Extract*/ true, CostKind);
1036+
return VF + BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
1037+
/*Extract*/ true, CostKind);
10101038
}
10111039
}
10121040

@@ -1115,10 +1143,17 @@ InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
11151143
TTI::TargetCostKind CostKind,
11161144
unsigned Index, Value *Op0,
11171145
Value *Op1) {
1118-
// vlvgp will insert two grs into a vector register, so only count half the
1119-
// number of instructions.
1120-
if (Opcode == Instruction::InsertElement && Val->isIntOrIntVectorTy(64))
1121-
return ((Index % 2 == 0) ? 1 : 0);
1146+
if (Opcode == Instruction::InsertElement) {
1147+
// Vector Element Load.
1148+
if (Op1 != nullptr && Op1->hasOneUse() && isa<LoadInst>(Op1))
1149+
return 0;
1150+
1151+
// vlvgp will insert two grs into a vector register, so count half the
1152+
// number of instructions as an estimate when we don't have the full
1153+
// picture (as in getScalarizationOverhead()).
1154+
if (Val->isIntOrIntVectorTy(64))
1155+
return ((Index % 2 == 0) ? 1 : 0);
1156+
}
11221157

11231158
if (Opcode == Instruction::ExtractElement) {
11241159
int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /*+test-under-mask*/ : 1);

llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,10 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
8181
bool hasDivRemOp(Type *DataType, bool IsSigned);
8282
bool prefersVectorizedAddressing() { return false; }
8383
bool LSRWithInstrQueries() { return true; }
84+
InstructionCost getScalarizationOverhead(VectorType *Ty,
85+
const APInt &DemandedElts,
86+
bool Insert, bool Extract,
87+
TTI::TargetCostKind CostKind);
8488
bool supportsEfficientVectorElementLoadStore() { return true; }
8589
bool enableInterleavedAccessVectorization() { return true; }
8690

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3033,8 +3033,8 @@ class BoUpSLP {
30333033
unsigned NumParts, bool ForOrder = false);
30343034

30353035
/// \returns the scalarization cost for this list of values. Assuming that
3036-
/// this subtree gets vectorized, we may need to extract the values from the
3037-
/// roots. This method calculates the cost of extracting the values.
3036+
/// this subtree gets vectorized, we may need to insert the values from the
3037+
/// roots. This method calculates the cost of inserting the values.
30383038
/// \param ForPoisonSrc true if initial vector is poison, false otherwise.
30393039
InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
30403040
Type *ScalarTy) const;
@@ -13013,7 +13013,15 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
1301313013
TTI::SK_InsertSubvector, VecTy, std::nullopt, CostKind,
1301413014
I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
1301513015
} else {
13016-
Cost = TTI->getScalarizationOverhead(VecTy, ~ShuffledElements,
13016+
// Add insertion costs for all elements, but not for loads that can be
13017+
// loaded directly into a vector element for free.
13018+
APInt FreeEltLoads = APInt::getZero(VL.size());
13019+
if (TTI->supportsEfficientVectorElementLoadStore())
13020+
for (unsigned I = 0, E = VL.size(); I < E; ++I)
13021+
if (VL[I]->hasOneUse() && isa<LoadInst>(VL[I]))
13022+
FreeEltLoads.setBit(I);
13023+
APInt DemandedElts = ~ShuffledElements & ~FreeEltLoads;
13024+
Cost = TTI->getScalarizationOverhead(VecTy, DemandedElts,
1301713025
/*Insert*/ true,
1301813026
/*Extract*/ false, CostKind);
1301913027
}

llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll

Lines changed: 13 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,4 @@
1-
; RUN: opt < %s -mtriple=s390x-unknown-linux -mcpu=z16 -S -passes=slp-vectorizer \
2-
; RUN: -pass-remarks-output=%t | FileCheck %s
3-
; RUN: cat %t | FileCheck -check-prefix=REMARK %s
4-
;
5-
; NB! This is a pre-commit version (for #112491) with current codegen and remarks.
1+
; RUN: opt -S --passes=slp-vectorizer -mtriple=s390x-unknown-linux -mcpu=z16 < %s | FileCheck %s
62
;
73
; Test functions that (at least currently) only get vectorized if the
84
; insertion cost for an element load is counted as free.
@@ -11,19 +7,8 @@
117
; getGatherCost().
128
define void @fun0(ptr nocapture %0, double %1) {
139
; CHECK-LABEL: define void @fun0(
14-
; CHECK: fmul double
15-
; CHECK: call double @llvm.fmuladd.f64(
16-
; CHECK-NEXT: call double @llvm.fmuladd.f64(
17-
; CHECK-NEXT: call double @llvm.sqrt.f64(
18-
; CHECK: fmul double
19-
; CHECK: call double @llvm.fmuladd.f64(
20-
; CHECK-NEXT: call double @llvm.fmuladd.f64(
21-
; CHECK-NEXT: call double @llvm.sqrt.f64(
22-
;
23-
; REMARK-LABEL: Function: fun0
24-
; REMARK: Args:
25-
; REMARK-NEXT: - String: 'List vectorization was possible but not beneficial with cost '
26-
; REMARK-NEXT: - Cost: '0'
10+
; CHECK: fmul <2 x double>
11+
; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
2712

2813
%3 = fmul double %1, 2.000000e+00
2914
%4 = tail call double @llvm.fmuladd.f64(double %3, double %3, double 0.000000e+00)
@@ -39,36 +24,18 @@ define void @fun0(ptr nocapture %0, double %1) {
3924
ret void
4025
}
4126

27+
4228
; This function needs the element-load to be recognized in SystemZ
4329
; getVectorInstrCost().
44-
define void @fun1(double %0) {
30+
define void @fun1(double %0) local_unnamed_addr {
4531
; CHECK-LABEL: define void @fun1(
46-
; CHECK: phi double
47-
; CHECK-NEXT: phi double
48-
; CHECK-NEXT: phi double
49-
; CHECK-NEXT: phi double
50-
; CHECK-NEXT: phi double
51-
; CHECK-NEXT: phi double
52-
; CHECK-NEXT: fsub double
53-
; CHECK-NEXT: fsub double
54-
; CHECK-NEXT: fmul double
55-
; CHECK-NEXT: fmul double
56-
; CHECK-NEXT: fsub double
57-
; CHECK-NEXT: fsub double
58-
; CHECK-NEXT: call double @llvm.fmuladd.f64(
59-
; CHECK-NEXT: call double @llvm.fmuladd.f64(
60-
; CHECK-NEXT: fsub double
61-
; CHECK-NEXT: fsub double
62-
; CHECK-NEXT: call double @llvm.fmuladd.f64(
63-
; CHECK-NEXT: call double @llvm.fmuladd.f64(
64-
; CHECK: fcmp olt double
65-
; CHECK-NEXT: fcmp olt double
66-
; CHECK-NEXT: or i1
67-
;
68-
; REMARK-LABEL: Function: fun1
69-
; REMARK: Args:
70-
; REMARK: - String: 'List vectorization was possible but not beneficial with cost '
71-
; REMARK-NEXT: - Cost: '0'
32+
; CHECK: fsub <2 x double>
33+
; CHECK: fsub <2 x double>
34+
; CHECK: fsub <2 x double>
35+
; CHECK: fmul <2 x double>
36+
; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
37+
; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
38+
; CHECK: %14 = fcmp olt <2 x double> %13, %2
7239

7340
br label %2
7441

@@ -104,14 +71,7 @@ declare double @llvm.fmuladd.f64(double, double, double)
10471
; which is recognized in SystemZTTImpl::getScalarizationOverhead().
10572
define void @fun2(ptr %0, ptr %Dst) {
10673
; CHECK-LABEL: define void @fun2(
107-
; CHECK: insertelement
108-
; CHECK: store <2 x i64>
109-
;
110-
; REMARK-LABEL: Function: fun2
111-
; REMARK: Args:
112-
; REMARK-NEXT: - String: 'Stores SLP vectorized with cost '
113-
; REMARK-NEXT: - Cost: '-1'
114-
74+
; CHECK-NOT: store <2 x i64>
11575
%3 = load i64, ptr %0, align 8
11676
%4 = icmp eq i64 %3, 0
11777
br i1 %4, label %5, label %6

0 commit comments

Comments
 (0)