@@ -469,6 +469,33 @@ bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
469
469
return (VT.isScalarInteger () && TLI->isTypeLegal (VT));
470
470
}
471
471
472
+ InstructionCost SystemZTTIImpl::
473
+ getScalarizationOverhead (VectorType *Ty,
474
+ const APInt &DemandedElts,
475
+ bool Insert, bool Extract,
476
+ TTI::TargetCostKind CostKind) {
477
+ unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements ();
478
+ InstructionCost Cost = 0 ;
479
+
480
+ if (Insert && Ty->isIntOrIntVectorTy (64 )) {
481
+ // VLVGP will insert two GPRs with one instruction.
482
+ InstructionCost CurrVectorCost = 0 ;
483
+ for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
484
+ if (DemandedElts[Idx])
485
+ ++CurrVectorCost;
486
+ if (Idx % 2 == 1 ) {
487
+ Cost += std::min (InstructionCost (1 ), CurrVectorCost);
488
+ CurrVectorCost = 0 ;
489
+ }
490
+ }
491
+ Insert = false ;
492
+ }
493
+
494
+ Cost += BaseT::getScalarizationOverhead (Ty, DemandedElts, Insert,
495
+ Extract, CostKind);
496
+ return Cost;
497
+ }
498
+
472
499
// Return the bit size for the scalar type or vector element
473
500
// type. getScalarSizeInBits() returns 0 for a pointer type.
474
501
static unsigned getScalarSizeInBits (Type *Ty) {
@@ -610,7 +637,7 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
610
637
if (DivRemConst) {
611
638
SmallVector<Type *> Tys (Args.size (), Ty);
612
639
return VF * DivMulSeqCost +
613
- getScalarizationOverhead (VTy, Args, Tys, CostKind);
640
+ BaseT:: getScalarizationOverhead (VTy, Args, Tys, CostKind);
614
641
}
615
642
if ((SignedDivRem || UnsignedDivRem) && VF > 4 )
616
643
// Temporary hack: disable high vectorization factors with integer
@@ -637,7 +664,7 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
637
664
SmallVector<Type *> Tys (Args.size (), Ty);
638
665
InstructionCost Cost =
639
666
(VF * ScalarCost) +
640
- getScalarizationOverhead (VTy, Args, Tys, CostKind);
667
+ BaseT:: getScalarizationOverhead (VTy, Args, Tys, CostKind);
641
668
// FIXME: VF 2 for these FP operations are currently just as
642
669
// expensive as for VF 4.
643
670
if (VF == 2 )
@@ -655,8 +682,9 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
655
682
// There is no native support for FRem.
656
683
if (Opcode == Instruction::FRem) {
657
684
SmallVector<Type *> Tys (Args.size (), Ty);
658
- InstructionCost Cost = (VF * LIBCALL_COST) +
659
- getScalarizationOverhead (VTy, Args, Tys, CostKind);
685
+ InstructionCost Cost =
686
+ (VF * LIBCALL_COST) +
687
+ BaseT::getScalarizationOverhead (VTy, Args, Tys, CostKind);
660
688
// FIXME: VF 2 for float is currently just as expensive as for VF 4.
661
689
if (VF == 2 && ScalarBits == 32 )
662
690
Cost *= 2 ;
@@ -976,10 +1004,10 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
976
1004
(Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
977
1005
NeedsExtracts = false ;
978
1006
979
- TotCost += getScalarizationOverhead (SrcVecTy, /* Insert*/ false ,
980
- NeedsExtracts, CostKind);
981
- TotCost += getScalarizationOverhead (DstVecTy, NeedsInserts,
982
- /* Extract*/ false , CostKind);
1007
+ TotCost += BaseT:: getScalarizationOverhead (SrcVecTy, /* Insert*/ false ,
1008
+ NeedsExtracts, CostKind);
1009
+ TotCost += BaseT:: getScalarizationOverhead (DstVecTy, NeedsInserts,
1010
+ /* Extract*/ false , CostKind);
983
1011
984
1012
// FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
985
1013
if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32 )
@@ -991,8 +1019,8 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
991
1019
if (Opcode == Instruction::FPTrunc) {
992
1020
if (SrcScalarBits == 128 ) // fp128 -> double/float + inserts of elements.
993
1021
return VF /* ldxbr/lexbr*/ +
994
- getScalarizationOverhead (DstVecTy, /* Insert*/ true ,
995
- /* Extract*/ false , CostKind);
1022
+ BaseT:: getScalarizationOverhead (DstVecTy, /* Insert*/ true ,
1023
+ /* Extract*/ false , CostKind);
996
1024
else // double -> float
997
1025
return VF / 2 /* vledb*/ + std::max (1U , VF / 4 /* vperm*/ );
998
1026
}
@@ -1005,8 +1033,8 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
1005
1033
return VF * 2 ;
1006
1034
}
1007
1035
// -> fp128. VF * lxdb/lxeb + extraction of elements.
1008
- return VF + getScalarizationOverhead (SrcVecTy, /* Insert*/ false ,
1009
- /* Extract*/ true , CostKind);
1036
+ return VF + BaseT:: getScalarizationOverhead (SrcVecTy, /* Insert*/ false ,
1037
+ /* Extract*/ true , CostKind);
1010
1038
}
1011
1039
}
1012
1040
@@ -1115,10 +1143,17 @@ InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
1115
1143
TTI::TargetCostKind CostKind,
1116
1144
unsigned Index, Value *Op0,
1117
1145
Value *Op1) {
1118
- // vlvgp will insert two grs into a vector register, so only count half the
1119
- // number of instructions.
1120
- if (Opcode == Instruction::InsertElement && Val->isIntOrIntVectorTy (64 ))
1121
- return ((Index % 2 == 0 ) ? 1 : 0 );
1146
+ if (Opcode == Instruction::InsertElement) {
1147
+ // Vector Element Load.
1148
+ if (Op1 != nullptr && Op1->hasOneUse () && isa<LoadInst>(Op1))
1149
+ return 0 ;
1150
+
1151
+ // vlvgp will insert two grs into a vector register, so count half the
1152
+ // number of instructions as an estimate when we don't have the full
1153
+ // picture (as in getScalarizationOverhead()).
1154
+ if (Val->isIntOrIntVectorTy (64 ))
1155
+ return ((Index % 2 == 0 ) ? 1 : 0 );
1156
+ }
1122
1157
1123
1158
if (Opcode == Instruction::ExtractElement) {
1124
1159
int Cost = ((getScalarSizeInBits (Val) == 1 ) ? 2 /* +test-under-mask*/ : 1 );
0 commit comments