Skip to content

Commit a12ca57

Browse files
authored
[SLP][REVEC] Add getScalarizationOverhead helper function to reduce errors when REVEC is enabled. (llvm#128530)
1 parent 4f7d894 commit a12ca57

File tree

1 file changed

+81
-85
lines changed

1 file changed

+81
-85
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 81 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -5014,6 +5014,42 @@ getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
50145014
return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
50155015
}
50165016

5017+
/// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
5018+
/// ScalarTy is a FixedVectorType, a vector will be inserted or extracted
5019+
/// instead of a scalar.
5020+
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI,
5021+
Type *ScalarTy, VectorType *Ty,
5022+
const APInt &DemandedElts,
5023+
bool Insert, bool Extract,
5024+
TTI::TargetCostKind CostKind,
5025+
ArrayRef<Value *> VL = {}) {
5026+
assert(!isa<ScalableVectorType>(Ty) &&
5027+
"ScalableVectorType is not supported.");
5028+
assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
5029+
getNumElements(Ty) &&
5030+
"Incorrect usage.");
5031+
if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
5032+
assert(SLPReVec && "Only supported by REVEC.");
5033+
// If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
5034+
// of CreateInsertElement.
5035+
unsigned ScalarTyNumElements = VecTy->getNumElements();
5036+
InstructionCost Cost = 0;
5037+
for (unsigned I : seq(DemandedElts.getBitWidth())) {
5038+
if (!DemandedElts[I])
5039+
continue;
5040+
if (Insert)
5041+
Cost += getShuffleCost(TTI, TTI::SK_InsertSubvector, Ty, {}, CostKind,
5042+
I * ScalarTyNumElements, VecTy);
5043+
if (Extract)
5044+
Cost += getShuffleCost(TTI, TTI::SK_ExtractSubvector, Ty, {}, CostKind,
5045+
I * ScalarTyNumElements, VecTy);
5046+
}
5047+
return Cost;
5048+
}
5049+
return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
5050+
CostKind, VL);
5051+
}
5052+
50175053
/// Correctly creates insert_subvector, checking that the index is multiple of
50185054
/// the subvectors length. Otherwise, generates shuffle using \p Generator or
50195055
/// using default shuffle.
@@ -5207,22 +5243,22 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
52075243
Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
52085244
// Estimate the cost of masked gather GEP. If not a splat, roughly
52095245
// estimate as a buildvector, otherwise estimate as splat.
5210-
APInt DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
5211-
VectorType *PtrVecTy =
5212-
getWidenedType(PointerOps.front()->getType()->getScalarType(),
5213-
VecTy->getNumElements());
5246+
APInt DemandedElts = APInt::getAllOnes(Sz);
5247+
Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
5248+
VectorType *PtrVecTy = getWidenedType(PtrScalarTy, Sz);
52145249
if (static_cast<unsigned>(count_if(
52155250
PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
52165251
any_of(PointerOps, [&](Value *V) {
52175252
return getUnderlyingObject(V) !=
52185253
getUnderlyingObject(PointerOps.front());
52195254
}))
5220-
VectorGEPCost += TTI.getScalarizationOverhead(
5221-
PtrVecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
5255+
VectorGEPCost += getScalarizationOverhead(TTI, PtrScalarTy, PtrVecTy,
5256+
DemandedElts, /*Insert=*/true,
5257+
/*Extract=*/false, CostKind);
52225258
else
52235259
VectorGEPCost +=
5224-
TTI.getScalarizationOverhead(
5225-
PtrVecTy, APInt::getOneBitSet(VecTy->getNumElements(), 0),
5260+
getScalarizationOverhead(
5261+
TTI, PtrScalarTy, PtrVecTy, APInt::getOneBitSet(Sz, 0),
52265262
/*Insert=*/true, /*Extract=*/false, CostKind) +
52275263
::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, {}, CostKind);
52285264
// The cost of scalar loads.
@@ -5240,8 +5276,9 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
52405276
/*VariableMask=*/false, CommonAlignment, CostKind) +
52415277
(ProfitableGatherPointers ? 0 : VectorGEPCost);
52425278
InstructionCost GatherCost =
5243-
TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
5244-
/*Extract=*/false, CostKind) +
5279+
getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
5280+
/*Insert=*/true,
5281+
/*Extract=*/false, CostKind) +
52455282
ScalarLoadsCost;
52465283
// The list of loads is small or perform partial check already - directly
52475284
// compare masked gather cost and gather cost.
@@ -5294,16 +5331,15 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
52945331
// Can be vectorized later as a series of loads/insertelements.
52955332
InstructionCost VecLdCost = 0;
52965333
if (!DemandedElts.isZero()) {
5297-
VecLdCost =
5298-
TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
5299-
/*Extract=*/false, CostKind) +
5300-
ScalarGEPCost;
5334+
VecLdCost = getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
5335+
/*Insert=*/true,
5336+
/*Extract=*/false, CostKind) +
5337+
ScalarGEPCost;
53015338
for (unsigned Idx : seq<unsigned>(VL.size()))
53025339
if (DemandedElts[Idx])
53035340
VecLdCost +=
53045341
TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
53055342
}
5306-
unsigned ScalarTyNumElements = getNumElements(ScalarTy);
53075343
auto *SubVecTy = getWidenedType(ScalarTy, VF);
53085344
for (auto [I, LS] : enumerate(States)) {
53095345
auto *LI0 = cast<LoadInst>(VL[I * VF]);
@@ -5323,13 +5359,13 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
53235359
return getUnderlyingObject(V) !=
53245360
getUnderlyingObject(PointerOps.front());
53255361
}))
5326-
VectorGEPCost += TTI.getScalarizationOverhead(
5327-
SubVecTy, APInt::getAllOnes(VF),
5362+
VectorGEPCost += getScalarizationOverhead(
5363+
TTI, ScalarTy, SubVecTy, APInt::getAllOnes(VF),
53285364
/*Insert=*/true, /*Extract=*/false, CostKind);
53295365
else
53305366
VectorGEPCost +=
5331-
TTI.getScalarizationOverhead(
5332-
SubVecTy, APInt::getOneBitSet(ScalarTyNumElements * VF, 0),
5367+
getScalarizationOverhead(
5368+
TTI, ScalarTy, SubVecTy, APInt::getOneBitSet(VF, 0),
53335369
/*Insert=*/true, /*Extract=*/false, CostKind) +
53345370
::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
53355371
CostKind);
@@ -9912,20 +9948,9 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
99129948
Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
99139949
Idx, getWidenedType(ScalarTy, Sz));
99149950
}
9915-
if (auto *FTy = dyn_cast<FixedVectorType>(ScalarTy)) {
9916-
assert(SLPReVec && "Only supported by REVEC.");
9917-
// If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
9918-
// of CreateInsertElement.
9919-
unsigned ScalarTyNumElements = getNumElements(ScalarTy);
9920-
for (unsigned I : seq<unsigned>(TE.Scalars.size()))
9921-
if (DemandedElts[I])
9922-
Cost +=
9923-
TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy, std::nullopt,
9924-
CostKind, I * ScalarTyNumElements, FTy);
9925-
} else {
9926-
Cost += TTI->getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
9927-
/*Extract=*/false, CostKind);
9928-
}
9951+
Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
9952+
/*Insert=*/true,
9953+
/*Extract=*/false, CostKind);
99299954
int Sz = TE.Scalars.size();
99309955
SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
99319956
TE.ReorderIndices.end());
@@ -9942,7 +9967,7 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
99429967
? TTI::SK_PermuteTwoSrc
99439968
: TTI::SK_PermuteSingleSrc,
99449969
VecTy, ReorderMask);
9945-
DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
9970+
DemandedElts = APInt::getAllOnes(TE.Scalars.size());
99469971
ReorderMask.assign(Sz, PoisonMaskElem);
99479972
for (unsigned I : seq<unsigned>(Sz)) {
99489973
Value *V = TE.getOrdered(I);
@@ -9954,8 +9979,9 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
99549979
ReorderMask[I] = I + Sz;
99559980
}
99569981
}
9957-
InstructionCost BVCost = TTI->getScalarizationOverhead(
9958-
VecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
9982+
InstructionCost BVCost =
9983+
getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
9984+
/*Insert=*/true, /*Extract=*/false, CostKind);
99599985
if (!DemandedElts.isAllOnes())
99609986
BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
99619987
if (Cost >= BVCost) {
@@ -11603,9 +11629,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
1160311629
assert(Offset < NumElts && "Failed to find vector index offset");
1160411630

1160511631
InstructionCost Cost = 0;
11606-
Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
11607-
/*Insert*/ true, /*Extract*/ false,
11608-
CostKind);
11632+
Cost -=
11633+
getScalarizationOverhead(*TTI, ScalarTy, SrcVecTy, DemandedElts,
11634+
/*Insert*/ true, /*Extract*/ false, CostKind);
1160911635

1161011636
// First cost - resize to actual vector size if not identity shuffle or
1161111637
// need to shift the vector.
@@ -13780,8 +13806,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
1378013806
}
1378113807
if (!IsIdentity)
1378213808
FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
13783-
FirstShuffleCost += TTI->getScalarizationOverhead(
13784-
MaskVecTy, DemandedElts, /*Insert=*/true,
13809+
FirstShuffleCost += getScalarizationOverhead(
13810+
*TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
1378513811
/*Extract=*/false, CostKind);
1378613812
}
1378713813
InstructionCost SecondShuffleCost = 0;
@@ -13805,17 +13831,17 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
1380513831
}
1380613832
if (!IsIdentity)
1380713833
SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
13808-
SecondShuffleCost += TTI->getScalarizationOverhead(
13809-
MaskVecTy, DemandedElts, /*Insert=*/true,
13834+
SecondShuffleCost += getScalarizationOverhead(
13835+
*TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
1381013836
/*Extract=*/false, CostKind);
1381113837
}
1381213838
APInt DemandedElts = APInt::getAllOnes(SubMask.size());
1381313839
for (auto [I, Idx] : enumerate(SubMask))
1381413840
if (Idx == PoisonMaskElem)
1381513841
DemandedElts.clearBit(I);
13816-
InstructionCost BuildVectorCost =
13817-
TTI->getScalarizationOverhead(MaskVecTy, DemandedElts, /*Insert=*/true,
13818-
/*Extract=*/false, CostKind);
13842+
InstructionCost BuildVectorCost = getScalarizationOverhead(
13843+
*TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
13844+
/*Extract=*/false, CostKind);
1381913845
const TreeEntry *BestEntry = nullptr;
1382013846
if (FirstShuffleCost < ShuffleCost) {
1382113847
std::for_each(std::next(Mask.begin(), Part * VL.size()),
@@ -13968,45 +13994,15 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
1396813994
ShuffledElements.setBit(I);
1396913995
ShuffleMask[I] = Res.first->second;
1397013996
}
13971-
if (!DemandedElements.isZero()) {
13972-
if (isa<FixedVectorType>(ScalarTy)) {
13973-
assert(SLPReVec && "Only supported by REVEC.");
13974-
// We don't need to insert elements one by one. Instead, we can insert the
13975-
// entire vector into the destination.
13976-
Cost = 0;
13977-
unsigned ScalarTyNumElements = getNumElements(ScalarTy);
13978-
for (unsigned I : seq<unsigned>(VL.size()))
13979-
if (DemandedElements[I])
13980-
Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {},
13981-
CostKind, I * ScalarTyNumElements,
13982-
cast<FixedVectorType>(ScalarTy));
13983-
} else {
13984-
Cost += TTI->getScalarizationOverhead(VecTy, DemandedElements,
13985-
/*Insert=*/true,
13986-
/*Extract=*/false, CostKind, VL);
13987-
}
13988-
}
13989-
if (ForPoisonSrc) {
13990-
if (isa<FixedVectorType>(ScalarTy)) {
13991-
assert(SLPReVec && "Only supported by REVEC.");
13992-
// We don't need to insert elements one by one. Instead, we can insert the
13993-
// entire vector into the destination.
13994-
assert(DemandedElements.isZero() &&
13995-
"Need to consider the cost from DemandedElements.");
13996-
Cost = 0;
13997-
unsigned ScalarTyNumElements = getNumElements(ScalarTy);
13998-
for (unsigned I : seq<unsigned>(VL.size()))
13999-
if (!ShuffledElements[I])
14000-
Cost += TTI->getShuffleCost(
14001-
TTI::SK_InsertSubvector, VecTy, std::nullopt, CostKind,
14002-
I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
14003-
} else {
14004-
Cost = TTI->getScalarizationOverhead(VecTy,
14005-
/*DemandedElts*/ ~ShuffledElements,
14006-
/*Insert*/ true,
14007-
/*Extract*/ false, CostKind, VL);
14008-
}
14009-
}
13997+
if (!DemandedElements.isZero())
13998+
Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements,
13999+
/*Insert=*/true,
14000+
/*Extract=*/false, CostKind, VL);
14001+
if (ForPoisonSrc)
14002+
Cost = getScalarizationOverhead(*TTI, ScalarTy, VecTy,
14003+
/*DemandedElts*/ ~ShuffledElements,
14004+
/*Insert*/ true,
14005+
/*Extract*/ false, CostKind, VL);
1401014006
if (DuplicateNonConst)
1401114007
Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
1401214008
VecTy, ShuffleMask);

0 commit comments

Comments
 (0)