Skip to content

Commit 2faacf6

Browse files
committed
[SLP] Improve shuffle cost estimation where possible.
Improved/fixed cost modeling for shuffles by providing masks; improved the cost model for non-identity insertelements. Differential Revision: https://reviews.llvm.org/D115462
1 parent 146f486 commit 2faacf6

12 files changed

+319
-303
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 84 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -5685,17 +5685,17 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
56855685
for (auto *V : VL) {
56865686
++Idx;
56875687

5688-
// Need to exclude undefs from analysis.
5689-
if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem)
5690-
continue;
5691-
56925688
// Reached the start of a new vector register.
56935689
if (Idx % EltsPerVector == 0) {
56945690
RegMask.assign(EltsPerVector, UndefMaskElem);
56955691
AllConsecutive = true;
56965692
continue;
56975693
}
56985694

5695+
// Need to exclude undefs from analysis.
5696+
if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem)
5697+
continue;
5698+
56995699
// Check all extracts for a vector register on the target directly
57005700
// extract values in order.
57015701
unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V));
@@ -6145,61 +6145,102 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
61456145
assert(E->ReuseShuffleIndices.empty() &&
61466146
"Unique insertelements only are expected.");
61476147
auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
6148-
61496148
unsigned const NumElts = SrcVecTy->getNumElements();
61506149
unsigned const NumScalars = VL.size();
6150+
6151+
unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
6152+
6153+
unsigned OffsetBeg = *getInsertIndex(VL.front());
6154+
unsigned OffsetEnd = OffsetBeg;
6155+
for (Value *V : VL.drop_front()) {
6156+
unsigned Idx = *getInsertIndex(V);
6157+
if (OffsetBeg > Idx)
6158+
OffsetBeg = Idx;
6159+
else if (OffsetEnd < Idx)
6160+
OffsetEnd = Idx;
6161+
}
6162+
unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
6163+
if (NumOfParts > 0)
6164+
VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
6165+
unsigned VecSz =
6166+
(1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
6167+
VecScalarsSz;
6168+
unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
6169+
unsigned InsertVecSz = std::min<unsigned>(
6170+
PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
6171+
((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) *
6172+
VecScalarsSz);
6173+
bool IsWholeSubvector =
6174+
OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
6175+
// Check if we can safely insert a subvector. If it is not possible, just
6176+
// generate a whole-sized vector and shuffle the source vector and the new
6177+
// subvector.
6178+
if (OffsetBeg + InsertVecSz > VecSz) {
6179+
// Align OffsetBeg to generate correct mask.
6180+
OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
6181+
InsertVecSz = VecSz;
6182+
}
6183+
61516184
APInt DemandedElts = APInt::getZero(NumElts);
61526185
// TODO: Add support for Instruction::InsertValue.
61536186
SmallVector<int> Mask;
61546187
if (!E->ReorderIndices.empty()) {
61556188
inversePermutation(E->ReorderIndices, Mask);
6156-
Mask.append(NumElts - NumScalars, UndefMaskElem);
6189+
Mask.append(InsertVecSz - Mask.size(), UndefMaskElem);
61576190
} else {
6158-
Mask.assign(NumElts, UndefMaskElem);
6159-
std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
6191+
Mask.assign(VecSz, UndefMaskElem);
6192+
std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
61606193
}
6161-
unsigned Offset = *getInsertIndex(VL0);
61626194
bool IsIdentity = true;
6163-
SmallVector<int> PrevMask(NumElts, UndefMaskElem);
6195+
SmallVector<int> PrevMask(InsertVecSz, UndefMaskElem);
61646196
Mask.swap(PrevMask);
61656197
for (unsigned I = 0; I < NumScalars; ++I) {
61666198
unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]);
61676199
DemandedElts.setBit(InsertIdx);
6168-
IsIdentity &= InsertIdx - Offset == I;
6169-
Mask[InsertIdx - Offset] = I;
6200+
IsIdentity &= InsertIdx - OffsetBeg == I;
6201+
Mask[InsertIdx - OffsetBeg] = I;
61706202
}
61716203
assert(Offset < NumElts && "Failed to find vector index offset");
61726204

61736205
InstructionCost Cost = 0;
61746206
Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
61756207
/*Insert*/ true, /*Extract*/ false);
61766208

6177-
if (IsIdentity && NumElts != NumScalars && Offset % NumScalars != 0) {
6178-
// FIXME: Replace with SK_InsertSubvector once it is properly supported.
6179-
unsigned Sz = PowerOf2Ceil(Offset + NumScalars);
6180-
Cost += TTI->getShuffleCost(
6181-
TargetTransformInfo::SK_PermuteSingleSrc,
6182-
FixedVectorType::get(SrcVecTy->getElementType(), Sz));
6183-
} else if (!IsIdentity) {
6184-
auto *FirstInsert =
6185-
cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
6186-
return !is_contained(E->Scalars,
6187-
cast<Instruction>(V)->getOperand(0));
6188-
}));
6189-
if (isUndefVector(FirstInsert->getOperand(0))) {
6190-
Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy, Mask);
6209+
// First cost - resize to actual vector size if not identity shuffle or
6210+
// need to shift the vector.
6211+
// Do not calculate the cost if the actual size is the register size and
6212+
// we can merge this shuffle with the following SK_Select.
6213+
auto *InsertVecTy =
6214+
FixedVectorType::get(SrcVecTy->getElementType(), InsertVecSz);
6215+
if (!IsIdentity)
6216+
Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
6217+
InsertVecTy, Mask);
6218+
auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
6219+
return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
6220+
}));
6221+
// Second cost - permutation with subvector, if some elements are from the
6222+
// initial vector or inserting a subvector.
6223+
// TODO: Implement the analysis of the FirstInsert->getOperand(0)
6224+
// subvector of ActualVecTy.
6225+
if (!isUndefVector(FirstInsert->getOperand(0)) && NumScalars != NumElts &&
6226+
!IsWholeSubvector) {
6227+
if (InsertVecSz != VecSz) {
6228+
auto *ActualVecTy =
6229+
FixedVectorType::get(SrcVecTy->getElementType(), VecSz);
6230+
Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy,
6231+
None, OffsetBeg - Offset, InsertVecTy);
61916232
} else {
6192-
SmallVector<int> InsertMask(NumElts);
6193-
std::iota(InsertMask.begin(), InsertMask.end(), 0);
6194-
for (unsigned I = 0; I < NumElts; I++) {
6233+
for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
6234+
Mask[I] = I;
6235+
for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
6236+
I <= End; ++I)
61956237
if (Mask[I] != UndefMaskElem)
6196-
InsertMask[Offset + I] = NumElts + I;
6197-
}
6198-
Cost +=
6199-
TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVecTy, InsertMask);
6238+
Mask[I] = I + VecSz;
6239+
for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
6240+
Mask[I] = I;
6241+
Cost += TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
62006242
}
62016243
}
6202-
62036244
return Cost;
62046245
}
62056246
case Instruction::ZExt:
@@ -6659,7 +6700,10 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
66596700
// No need to vectorize inserts of gathered values.
66606701
if (VectorizableTree.size() == 2 &&
66616702
isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
6662-
VectorizableTree[1]->State == TreeEntry::NeedToGather)
6703+
VectorizableTree[1]->State == TreeEntry::NeedToGather &&
6704+
(VectorizableTree[1]->getVectorFactor() <= 2 ||
6705+
!(isSplat(VectorizableTree[1]->Scalars) ||
6706+
allConstant(VectorizableTree[1]->Scalars))))
66636707
return true;
66646708

66656709
// We can vectorize the tree if its size is greater than or equal to the
@@ -7693,6 +7737,11 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
76937737
return createBuildVector(VL);
76947738
}
76957739
Value *BoUpSLP::createBuildVector(ArrayRef<Value *> VL) {
7740+
assert(any_of(VectorizableTree,
7741+
[VL](const std::unique_ptr<TreeEntry> &TE) {
7742+
return TE->State == TreeEntry::NeedToGather && TE->isSame(VL);
7743+
}) &&
7744+
"Non-matching gather node.");
76967745
unsigned VF = VL.size();
76977746
// Exploit possible reuse of values across lanes.
76987747
SmallVector<int> ReuseShuffleIndicies;

0 commit comments

Comments
 (0)