@@ -5014,6 +5014,42 @@ getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
  return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
}

+/// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
+/// ScalarTy is a FixedVectorType, a vector will be inserted or extracted
+/// instead of a scalar.
+static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI,
+                                                Type *ScalarTy, VectorType *Ty,
+                                                const APInt &DemandedElts,
+                                                bool Insert, bool Extract,
+                                                TTI::TargetCostKind CostKind,
+                                                ArrayRef<Value *> VL = {}) {
+  assert(!isa<ScalableVectorType>(Ty) &&
+         "ScalableVectorType is not supported.");
+  assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
+             getNumElements(Ty) &&
+         "Incorrect usage.");
+  if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
+    assert(SLPReVec && "Only supported by REVEC.");
+    // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
+    // of CreateInsertElement.
+    unsigned ScalarTyNumElements = VecTy->getNumElements();
+    InstructionCost Cost = 0;
+    for (unsigned I : seq(DemandedElts.getBitWidth())) {
+      if (!DemandedElts[I])
+        continue;
+      if (Insert)
+        Cost += getShuffleCost(TTI, TTI::SK_InsertSubvector, Ty, {}, CostKind,
+                               I * ScalarTyNumElements, VecTy);
+      if (Extract)
+        Cost += getShuffleCost(TTI, TTI::SK_ExtractSubvector, Ty, {}, CostKind,
+                               I * ScalarTyNumElements, VecTy);
+    }
+    return Cost;
+  }
+  return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
+                                      CostKind, VL);
+}
+
/// Correctly creates insert_subvector, checking that the index is multiple of
/// the subvectors length. Otherwise, generates shuffle using \p Generator or
/// using default shuffle.
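
For intuition, a minimal standalone sketch of what the REVEC branch of the new helper computes: one SK_InsertSubvector (or SK_ExtractSubvector) quote per demanded ScalarTy-sized slot, at lane offset I * ScalarTyNumElements. The concrete types, demanded-bit pattern, and per-offset costs below are hypothetical stand-ins, not values queried from TTI:

#include <cstdio>
#include <vector>

// Model: ScalarTy = <2 x i32>, Ty = <8 x i32>, so DemandedElts has
// 8 / 2 = 4 bits, one per <2 x i32> subvector slot.
int main() {
  const unsigned ScalarTyNumElements = 2;              // lanes per subvector
  const std::vector<bool> DemandedElts = {1, 0, 1, 1}; // slots 0, 2, 3 demanded
  // Hypothetical target quote for inserting a subvector at lane offset Off.
  auto InsertSubvectorCost = [](unsigned Off) { return Off == 0 ? 0u : 1u; };
  unsigned Cost = 0;
  for (unsigned I = 0; I < DemandedElts.size(); ++I) {
    if (!DemandedElts[I])
      continue; // undemanded slots cost nothing, as in the helper
    Cost += InsertSubvectorCost(I * ScalarTyNumElements);
  }
  std::printf("modeled insert cost: %u\n", Cost); // offsets 0, 4, 6 -> 0+1+1
  return 0;
}
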
@@ -5207,22 +5243,22 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
          Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
      // Estimate the cost of masked gather GEP. If not a splat, roughly
      // estimate as a buildvector, otherwise estimate as splat.
-     APInt DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
-     VectorType *PtrVecTy =
-         getWidenedType(PointerOps.front()->getType()->getScalarType(),
-                        VecTy->getNumElements());
+     APInt DemandedElts = APInt::getAllOnes(Sz);
+     Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
+     VectorType *PtrVecTy = getWidenedType(PtrScalarTy, Sz);
      if (static_cast<unsigned>(count_if(
              PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
          any_of(PointerOps, [&](Value *V) {
            return getUnderlyingObject(V) !=
                   getUnderlyingObject(PointerOps.front());
          }))
-       VectorGEPCost += TTI.getScalarizationOverhead(
-           PtrVecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
+       VectorGEPCost += getScalarizationOverhead(TTI, PtrScalarTy, PtrVecTy,
+                                                 DemandedElts, /*Insert=*/true,
+                                                 /*Extract=*/false, CostKind);
      else
        VectorGEPCost +=
-           TTI.getScalarizationOverhead(
-               PtrVecTy, APInt::getOneBitSet(VecTy->getNumElements(), 0),
+           getScalarizationOverhead(
+               TTI, PtrScalarTy, PtrVecTy, APInt::getOneBitSet(Sz, 0),
                /*Insert=*/true, /*Extract=*/false, CostKind) +
            ::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, {}, CostKind);
      // The cost of scalar loads.
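
To make the splat/buildvector split in this hunk concrete: when the pointers are not a splat, the vector GEP is priced roughly as one insert per lane; when they are a splat, as a single insert plus one SK_Broadcast shuffle. A toy model with hypothetical unit costs (not real TTI numbers):

#include <cstdio>

// Toy model of the two pointer-vector estimates in canVectorizeLoads.
// InsertCost and BroadcastCost are hypothetical per-operation unit costs.
int main() {
  const unsigned Sz = 8;            // number of pointer lanes
  const unsigned InsertCost = 1;    // stand-in for one element insert
  const unsigned BroadcastCost = 1; // stand-in for SK_Broadcast
  unsigned BuildVectorEstimate = Sz * InsertCost;      // not a splat
  unsigned SplatEstimate = InsertCost + BroadcastCost; // splat
  std::printf("buildvector=%u splat=%u\n", BuildVectorEstimate, SplatEstimate);
  return 0;
}
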
@@ -5240,8 +5276,9 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                                     /*VariableMask=*/false, CommonAlignment,
                                     CostKind) +
          (ProfitableGatherPointers ? 0 : VectorGEPCost);
      InstructionCost GatherCost =
-         TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
-                                      /*Extract=*/false, CostKind) +
+         getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
+                                  /*Insert=*/true,
+                                  /*Extract=*/false, CostKind) +
          ScalarLoadsCost;
      // The list of loads is small or perform partial check already - directly
      // compare masked gather cost and gather cost.
@@ -5294,16 +5331,15 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
      // Can be vectorized later as a series of loads/insertelements.
      InstructionCost VecLdCost = 0;
      if (!DemandedElts.isZero()) {
-       VecLdCost =
-           TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
-                                        /*Extract=*/false, CostKind) +
-           ScalarGEPCost;
+       VecLdCost = getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
+                                            /*Insert=*/true,
+                                            /*Extract=*/false, CostKind) +
+                   ScalarGEPCost;
        for (unsigned Idx : seq<unsigned>(VL.size()))
          if (DemandedElts[Idx])
            VecLdCost +=
                TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
      }
-     unsigned ScalarTyNumElements = getNumElements(ScalarTy);
      auto *SubVecTy = getWidenedType(ScalarTy, VF);
      for (auto [I, LS] : enumerate(States)) {
        auto *LI0 = cast<LoadInst>(VL[I * VF]);
@@ -5323,13 +5359,13 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
            return getUnderlyingObject(V) !=
                   getUnderlyingObject(PointerOps.front());
          }))
-       VectorGEPCost += TTI.getScalarizationOverhead(
-           SubVecTy, APInt::getAllOnes(VF),
+       VectorGEPCost += getScalarizationOverhead(
+           TTI, ScalarTy, SubVecTy, APInt::getAllOnes(VF),
            /*Insert=*/true, /*Extract=*/false, CostKind);
      else
        VectorGEPCost +=
-           TTI.getScalarizationOverhead(
-               SubVecTy, APInt::getOneBitSet(ScalarTyNumElements * VF, 0),
+           getScalarizationOverhead(
+               TTI, ScalarTy, SubVecTy, APInt::getOneBitSet(VF, 0),
                /*Insert=*/true, /*Extract=*/false, CostKind) +
            ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
                             CostKind);
@@ -9912,20 +9948,9 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
    Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
                             Idx, getWidenedType(ScalarTy, Sz));
  }
- if (auto *FTy = dyn_cast<FixedVectorType>(ScalarTy)) {
-   assert(SLPReVec && "Only supported by REVEC.");
-   // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
-   // of CreateInsertElement.
-   unsigned ScalarTyNumElements = getNumElements(ScalarTy);
-   for (unsigned I : seq<unsigned>(TE.Scalars.size()))
-     if (DemandedElts[I])
-       Cost +=
-           TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy, std::nullopt,
-                               CostKind, I * ScalarTyNumElements, FTy);
- } else {
-   Cost += TTI->getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
-                                         /*Extract=*/false, CostKind);
- }
+ Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
+                                  /*Insert=*/true,
+                                  /*Extract=*/false, CostKind);
  int Sz = TE.Scalars.size();
  SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
                               TE.ReorderIndices.end());
@@ -9942,7 +9967,7 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
                                   ? TTI::SK_PermuteTwoSrc
                                   : TTI::SK_PermuteSingleSrc,
                               VecTy, ReorderMask);
- DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
+ DemandedElts = APInt::getAllOnes(TE.Scalars.size());
  ReorderMask.assign(Sz, PoisonMaskElem);
  for (unsigned I : seq<unsigned>(Sz)) {
    Value *V = TE.getOrdered(I);
@@ -9954,8 +9979,9 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
      ReorderMask[I] = I + Sz;
    }
  }
- InstructionCost BVCost = TTI->getScalarizationOverhead(
-     VecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
+ InstructionCost BVCost =
+     getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
+                              /*Insert=*/true, /*Extract=*/false, CostKind);
  if (!DemandedElts.isAllOnes())
    BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
  if (Cost >= BVCost) {
@@ -11603,9 +11629,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
    assert(Offset < NumElts && "Failed to find vector index offset");

    InstructionCost Cost = 0;
-   Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
-                                         /*Insert*/ true, /*Extract*/ false,
-                                         CostKind);
+   Cost -=
+       getScalarizationOverhead(*TTI, ScalarTy, SrcVecTy, DemandedElts,
+                                /*Insert*/ true, /*Extract*/ false, CostKind);

    // First cost - resize to actual vector size if not identity shuffle or
    // need to shift the vector.
@@ -13780,8 +13806,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
      }
      if (!IsIdentity)
        FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
-     FirstShuffleCost += TTI->getScalarizationOverhead(
-         MaskVecTy, DemandedElts, /*Insert=*/true,
+     FirstShuffleCost += getScalarizationOverhead(
+         *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
          /*Extract=*/false, CostKind);
    }
    InstructionCost SecondShuffleCost = 0;
@@ -13805,17 +13831,17 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
      }
      if (!IsIdentity)
        SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
-     SecondShuffleCost += TTI->getScalarizationOverhead(
-         MaskVecTy, DemandedElts, /*Insert=*/true,
+     SecondShuffleCost += getScalarizationOverhead(
+         *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
          /*Extract=*/false, CostKind);
    }
    APInt DemandedElts = APInt::getAllOnes(SubMask.size());
    for (auto [I, Idx] : enumerate(SubMask))
      if (Idx == PoisonMaskElem)
        DemandedElts.clearBit(I);
-   InstructionCost BuildVectorCost =
-       TTI->getScalarizationOverhead(MaskVecTy, DemandedElts, /*Insert=*/true,
-                                     /*Extract=*/false, CostKind);
+   InstructionCost BuildVectorCost = getScalarizationOverhead(
+       *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
+       /*Extract=*/false, CostKind);
    const TreeEntry *BestEntry = nullptr;
    if (FirstShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
@@ -13968,45 +13994,15 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
      ShuffledElements.setBit(I);
    ShuffleMask[I] = Res.first->second;
  }
- if (!DemandedElements.isZero()) {
-   if (isa<FixedVectorType>(ScalarTy)) {
-     assert(SLPReVec && "Only supported by REVEC.");
-     // We don't need to insert elements one by one. Instead, we can insert the
-     // entire vector into the destination.
-     Cost = 0;
-     unsigned ScalarTyNumElements = getNumElements(ScalarTy);
-     for (unsigned I : seq<unsigned>(VL.size()))
-       if (DemandedElements[I])
-         Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {},
-                                  CostKind, I * ScalarTyNumElements,
-                                  cast<FixedVectorType>(ScalarTy));
-   } else {
-     Cost += TTI->getScalarizationOverhead(VecTy, DemandedElements,
-                                           /*Insert=*/true,
-                                           /*Extract=*/false, CostKind, VL);
-   }
- }
- if (ForPoisonSrc) {
-   if (isa<FixedVectorType>(ScalarTy)) {
-     assert(SLPReVec && "Only supported by REVEC.");
-     // We don't need to insert elements one by one. Instead, we can insert the
-     // entire vector into the destination.
-     assert(DemandedElements.isZero() &&
-            "Need to consider the cost from DemandedElements.");
-     Cost = 0;
-     unsigned ScalarTyNumElements = getNumElements(ScalarTy);
-     for (unsigned I : seq<unsigned>(VL.size()))
-       if (!ShuffledElements[I])
-         Cost += TTI->getShuffleCost(
-             TTI::SK_InsertSubvector, VecTy, std::nullopt, CostKind,
-             I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
-   } else {
-     Cost = TTI->getScalarizationOverhead(VecTy,
-                                          /*DemandedElts*/ ~ShuffledElements,
-                                          /*Insert*/ true,
-                                          /*Extract*/ false, CostKind, VL);
-   }
- }
+ if (!DemandedElements.isZero())
+   Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements,
+                                    /*Insert=*/true,
+                                    /*Extract=*/false, CostKind, VL);
+ if (ForPoisonSrc)
+   Cost = getScalarizationOverhead(*TTI, ScalarTy, VecTy,
+                                   /*DemandedElts*/ ~ShuffledElements,
+                                   /*Insert*/ true,
+                                   /*Extract*/ false, CostKind, VL);
  if (DuplicateNonConst)
    Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
                             VecTy, ShuffleMask);