Skip to content

Commit 279b1ea

Browse files
committed
[SLP] Improve gathering of the scalars used in the graph.
Currently, we emit gathers for scalars that are vectorized in the tree as pairs of extractelement/insertelement instructions. Instead, we can try to find all the required vectors and emit shufflevector instructions directly, improving the generated code and reducing compile time. Part of non-power-of-2 vectorization. Differential Revision: https://reviews.llvm.org/D110978
1 parent 7e7aaa5 commit 279b1ea

File tree

2 files changed

+67
-268
lines changed

2 files changed

+67
-268
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 46 additions & 225 deletions
Original file line numberDiff line numberDiff line change
@@ -436,26 +436,6 @@ static SmallBitVector isUndefVector(const Value *V,
436436
/// i32 6>
437437
/// %2 = mul <4 x i8> %1, %1
438438
/// ret <4 x i8> %2
439-
/// We convert this initially to something like:
440-
/// %x0 = extractelement <4 x i8> %x, i32 0
441-
/// %x3 = extractelement <4 x i8> %x, i32 3
442-
/// %y1 = extractelement <4 x i8> %y, i32 1
443-
/// %y2 = extractelement <4 x i8> %y, i32 2
444-
/// %1 = insertelement <4 x i8> poison, i8 %x0, i32 0
445-
/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1
446-
/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2
447-
/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3
448-
/// %5 = mul <4 x i8> %4, %4
449-
/// %6 = extractelement <4 x i8> %5, i32 0
450-
/// %ins1 = insertelement <4 x i8> poison, i8 %6, i32 0
451-
/// %7 = extractelement <4 x i8> %5, i32 1
452-
/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1
453-
/// %8 = extractelement <4 x i8> %5, i32 2
454-
/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2
455-
/// %9 = extractelement <4 x i8> %5, i32 3
456-
/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
457-
/// ret <4 x i8> %ins4
458-
/// InstCombiner transforms this into a shuffle and vector mul
459439
/// Mask will return the Shuffle Mask equivalent to the extracted elements.
460440
/// TODO: Can we split off and reuse the shuffle mask detection from
461441
/// ShuffleVectorInst/getShuffleCost?
@@ -7505,6 +7485,14 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
75057485
}
75067486
return VecBase;
75077487
}
7488+
/// Checks if the specified entry \p E needs to be delayed because of its
7489+
/// dependency nodes.
7490+
std::optional<InstructionCost>
7491+
needToDelay(const TreeEntry *,
7492+
ArrayRef<SmallVector<const TreeEntry *>>) const {
7493+
// No need to delay the cost estimation during analysis.
7494+
return std::nullopt;
7495+
}
75087496
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
75097497
if (&E1 == &E2) {
75107498
assert(all_of(Mask,
@@ -7619,13 +7607,16 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
76197607
if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
76207608
CommonMask[Idx] = Mask[Idx] + VF;
76217609
}
7622-
Value *gather(ArrayRef<Value *> VL, Value *Root = nullptr) {
7610+
Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
7611+
Value *Root = nullptr) {
76237612
Cost += getBuildVectorCost(VL, Root);
76247613
if (!Root) {
7625-
assert(InVectors.empty() && "Unexpected input vectors for buildvector.");
76267614
// FIXME: Need to find a way to avoid use of getNullValue here.
76277615
SmallVector<Constant *> Vals;
7628-
for (Value *V : VL) {
7616+
unsigned VF = VL.size();
7617+
if (MaskVF != 0)
7618+
VF = std::min(VF, MaskVF);
7619+
for (Value *V : VL.take_front(VF)) {
76297620
if (isa<UndefValue>(V)) {
76307621
Vals.push_back(cast<Constant>(V));
76317622
continue;
@@ -7635,9 +7626,11 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
76357626
return ConstantVector::get(Vals);
76367627
}
76377628
return ConstantVector::getSplat(
7638-
ElementCount::getFixed(VL.size()),
7629+
ElementCount::getFixed(
7630+
cast<FixedVectorType>(Root->getType())->getNumElements()),
76397631
getAllOnesValue(*R.DL, VL.front()->getType()));
76407632
}
7633+
InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
76417634
/// Finalize emission of the shuffles.
76427635
InstructionCost
76437636
finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
@@ -7659,8 +7652,10 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
76597652
InVectors.front() = V;
76607653
}
76617654
::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
7662-
if (CommonMask.empty())
7655+
if (CommonMask.empty()) {
7656+
assert(InVectors.size() == 1 && "Expected only one vector with no mask");
76637657
return Cost;
7658+
}
76647659
return Cost +
76657660
createShuffle(InVectors.front(),
76667661
InVectors.size() == 2 ? InVectors.back() : nullptr,
@@ -7737,189 +7732,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
77377732
return 0;
77387733
if (isa<InsertElementInst>(VL[0]))
77397734
return InstructionCost::getInvalid();
7740-
ShuffleCostEstimator Estimator(*TTI, VectorizedVals, *this,
7741-
CheckedExtracts);
7742-
unsigned VF = E->getVectorFactor();
7743-
SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(),
7744-
E->ReuseShuffleIndices.end());
7745-
SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
7746-
// Build a mask out of the reorder indices and reorder scalars per this
7747-
// mask.
7748-
SmallVector<int> ReorderMask;
7749-
inversePermutation(E->ReorderIndices, ReorderMask);
7750-
if (!ReorderMask.empty())
7751-
reorderScalars(GatheredScalars, ReorderMask);
7752-
SmallVector<int> Mask;
7753-
SmallVector<int> ExtractMask;
7754-
Value *ExtractVecBase = nullptr;
7755-
bool UseVecBaseAsInput = false;
7756-
SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles;
7757-
SmallVector<SmallVector<const TreeEntry *>> Entries;
7758-
SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
7759-
// Check for gathered extracts.
7760-
bool Resized = false;
7761-
unsigned NumParts = TTI->getNumberOfParts(VecTy);
7762-
if (NumParts == 0 || NumParts >= GatheredScalars.size())
7763-
NumParts = 1;
7764-
if (!all_of(GatheredScalars, UndefValue::classof)) {
7765-
ExtractShuffles =
7766-
tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
7767-
if (!ExtractShuffles.empty()) {
7768-
if (Value *VecBase = Estimator.adjustExtracts(
7769-
E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
7770-
if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
7771-
if (VF == VecBaseTy->getNumElements() &&
7772-
GatheredScalars.size() != VF) {
7773-
Resized = true;
7774-
GatheredScalars.append(VF - GatheredScalars.size(),
7775-
PoisonValue::get(ScalarTy));
7776-
}
7777-
}
7778-
}
7779-
7780-
// Do not try to look for reshuffled loads for gathered loads (they will
7781-
// be handled later), for vectorized scalars, and cases, which are
7782-
// definitely not profitable (splats and small gather nodes.)
7783-
if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
7784-
E->isAltShuffle() ||
7785-
all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
7786-
isSplat(E->Scalars) ||
7787-
(E->Scalars != GatheredScalars && GatheredScalars.size() <= 2))
7788-
GatherShuffles =
7789-
isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
7790-
}
7791-
if (!GatherShuffles.empty()) {
7792-
if (GatherShuffles.size() == 1 &&
7793-
*GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
7794-
Entries.front().front()->isSame(E->Scalars)) {
7795-
// Perfect match in the graph, will reuse the previously vectorized
7796-
// node. Cost is 0.
7797-
LLVM_DEBUG(
7798-
dbgs()
7799-
<< "SLP: perfect diamond match for gather bundle "
7800-
<< shortBundleName(VL) << ".\n");
7801-
// Restore the mask for previous partially matched values.
7802-
Mask.resize(E->Scalars.size());
7803-
const TreeEntry *FrontTE = Entries.front().front();
7804-
if (FrontTE->ReorderIndices.empty() &&
7805-
((FrontTE->ReuseShuffleIndices.empty() &&
7806-
E->Scalars.size() == FrontTE->Scalars.size()) ||
7807-
(E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
7808-
std::iota(Mask.begin(), Mask.end(), 0);
7809-
} else {
7810-
for (auto [I, V] : enumerate(E->Scalars)) {
7811-
if (isa<PoisonValue>(V)) {
7812-
Mask[I] = PoisonMaskElem;
7813-
continue;
7814-
}
7815-
Mask[I] = FrontTE->findLaneForValue(V);
7816-
}
7817-
}
7818-
Estimator.add(*FrontTE, Mask);
7819-
return Estimator.finalize(E->getCommonMask());
7820-
}
7821-
if (!Resized) {
7822-
if (GatheredScalars.size() != VF &&
7823-
any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
7824-
return any_of(TEs, [&](const TreeEntry *TE) {
7825-
return TE->getVectorFactor() == VF;
7826-
});
7827-
}))
7828-
GatheredScalars.append(VF - GatheredScalars.size(),
7829-
PoisonValue::get(ScalarTy));
7830-
}
7831-
// Remove shuffled elements from list of gathers.
7832-
for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
7833-
if (Mask[I] != PoisonMaskElem)
7834-
GatheredScalars[I] = PoisonValue::get(ScalarTy);
7835-
}
7836-
LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()
7837-
<< " entries for bundle "
7838-
<< shortBundleName(VL) << ".\n");
7839-
unsigned SliceSize = E->Scalars.size() / NumParts;
7840-
SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
7841-
for (const auto [I, TEs] : enumerate(Entries)) {
7842-
if (TEs.empty()) {
7843-
assert(!GatherShuffles[I] &&
7844-
"No shuffles with empty entries list expected.");
7845-
continue;
7846-
}
7847-
assert((TEs.size() == 1 || TEs.size() == 2) &&
7848-
"Expected shuffle of 1 or 2 entries.");
7849-
auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize);
7850-
VecMask.assign(VecMask.size(), PoisonMaskElem);
7851-
copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
7852-
Estimator.add(*TEs.front(), *TEs.back(), VecMask);
7853-
}
7854-
if (all_of(GatheredScalars, PoisonValue ::classof))
7855-
return Estimator.finalize(E->ReuseShuffleIndices);
7856-
return Estimator.finalize(
7857-
E->ReuseShuffleIndices, E->Scalars.size(),
7858-
[&](Value *&Vec, SmallVectorImpl<int> &Mask) {
7859-
Vec = Estimator.gather(GatheredScalars,
7860-
Constant::getNullValue(FixedVectorType::get(
7861-
ScalarTy, GatheredScalars.size())));
7862-
});
7863-
}
7864-
if (!ExtractShuffles.empty()) {
7865-
Value *Vec1 = nullptr;
7866-
// Gather of extractelements can be represented as just a shuffle of
7867-
// a single/two vectors the scalars are extracted from.
7868-
// Find input vectors.
7869-
Value *Vec2 = nullptr;
7870-
for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
7871-
if (!Mask.empty() && Mask[I] != PoisonMaskElem)
7872-
ExtractMask[I] = PoisonMaskElem;
7873-
}
7874-
if (UseVecBaseAsInput) {
7875-
Vec1 = ExtractVecBase;
7876-
} else {
7877-
for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
7878-
if (ExtractMask[I] == PoisonMaskElem)
7879-
continue;
7880-
if (isa<UndefValue>(E->Scalars[I]))
7881-
continue;
7882-
auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
7883-
Value *VecOp = EI->getVectorOperand();
7884-
if (const auto *TE = getTreeEntry(VecOp))
7885-
if (TE->VectorizedValue)
7886-
VecOp = TE->VectorizedValue;
7887-
if (!Vec1) {
7888-
Vec1 = VecOp;
7889-
} else if (Vec1 != EI->getVectorOperand()) {
7890-
assert((!Vec2 || Vec2 == EI->getVectorOperand()) &&
7891-
"Expected only 1 or 2 vectors shuffle.");
7892-
Vec2 = VecOp;
7893-
}
7894-
}
7895-
}
7896-
if (Vec2) {
7897-
Estimator.add(Vec1, Vec2, ExtractMask);
7898-
} else if (Vec1) {
7899-
Estimator.add(Vec1, ExtractMask, /*ForExtracts=*/true);
7900-
} else {
7901-
Estimator.add(PoisonValue::get(FixedVectorType::get(
7902-
ScalarTy, GatheredScalars.size())),
7903-
ExtractMask, /*ForExtracts=*/true);
7904-
}
7905-
}
7906-
if (!all_of(GatheredScalars, PoisonValue::classof)) {
7907-
auto Gathers = ArrayRef(GatheredScalars).take_front(VL.size());
7908-
bool SameGathers = VL.equals(Gathers);
7909-
if (!SameGathers)
7910-
return Estimator.finalize(
7911-
E->ReuseShuffleIndices, E->Scalars.size(),
7912-
[&](Value *&Vec, SmallVectorImpl<int> &Mask) {
7913-
Vec = Estimator.gather(
7914-
GatheredScalars, Constant::getNullValue(FixedVectorType::get(
7915-
ScalarTy, GatheredScalars.size())));
7916-
});
7917-
Value *BV = Estimator.gather(Gathers);
7918-
SmallVector<int> ReuseMask(Gathers.size(), PoisonMaskElem);
7919-
std::iota(ReuseMask.begin(), ReuseMask.end(), 0);
7920-
Estimator.add(BV, ReuseMask);
7921-
}
7922-
return Estimator.finalize(E->ReuseShuffleIndices);
7735+
return processBuildVector<ShuffleCostEstimator, InstructionCost>(
7736+
E, *TTI, VectorizedVals, *this, CheckedExtracts);
79237737
}
79247738
InstructionCost CommonCost = 0;
79257739
SmallVector<int> Mask;
@@ -10337,6 +10151,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
1033710151

1033810152
/// Adjusts extractelements after reusing them.
1033910153
Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
10154+
ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
1034010155
unsigned NumParts, bool &UseVecBaseAsInput) {
1034110156
UseVecBaseAsInput = false;
1034210157
SmallPtrSet<Value *, 4> UniqueBases;
@@ -10441,14 +10256,15 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
1044110256
}
1044210257
/// Checks if the specified entry \p E needs to be delayed because of its
1044310258
/// dependency nodes.
10444-
Value *needToDelay(const TreeEntry *E,
10445-
ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
10259+
std::optional<Value *>
10260+
needToDelay(const TreeEntry *E,
10261+
ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
1044610262
// No need to delay emission if all deps are ready.
1044710263
if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
1044810264
return all_of(
1044910265
TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
1045010266
}))
10451-
return nullptr;
10267+
return std::nullopt;
1045210268
// Postpone gather emission, will be emitted after the end of the
1045310269
// process to keep correct order.
1045410270
auto *VecTy = FixedVectorType::get(E->Scalars.front()->getType(),
@@ -10558,7 +10374,8 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
1055810374
inversePermutation(Order, NewMask);
1055910375
add(V1, NewMask);
1056010376
}
10561-
Value *gather(ArrayRef<Value *> VL, Value *Root = nullptr) {
10377+
Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
10378+
Value *Root = nullptr) {
1056210379
return R.gather(VL, Root);
1056310380
}
1056410381
Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
@@ -10819,15 +10636,16 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
1081910636
cast<ExtractElementInst>(E->Scalars[Idx])->getVectorOperand()))
1082010637
ExtractEntries.push_back(TE);
1082110638
}
10822-
if (Value *Delayed = ShuffleBuilder.needToDelay(E, ExtractEntries)) {
10639+
if (std::optional<ResTy> Delayed =
10640+
ShuffleBuilder.needToDelay(E, ExtractEntries)) {
1082310641
// Delay emission of gathers which are not ready yet.
1082410642
PostponedGathers.insert(E);
1082510643
// Postpone gather emission, will be emitted after the end of the
1082610644
// process to keep correct order.
10827-
return Delayed;
10645+
return *Delayed;
1082810646
}
1082910647
if (Value *VecBase = ShuffleBuilder.adjustExtracts(
10830-
E, ExtractMask, NumParts, UseVecBaseAsInput)) {
10648+
E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
1083110649
ExtractVecBase = VecBase;
1083210650
if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
1083310651
if (VF == VecBaseTy->getNumElements() &&
@@ -10848,12 +10666,13 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
1084810666
isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
1084910667
}
1085010668
if (!GatherShuffles.empty()) {
10851-
if (Value *Delayed = ShuffleBuilder.needToDelay(E, Entries)) {
10669+
if (std::optional<ResTy> Delayed =
10670+
ShuffleBuilder.needToDelay(E, Entries)) {
1085210671
// Delay emission of gathers which are not ready yet.
1085310672
PostponedGathers.insert(E);
1085410673
// Postpone gather emission, will be emitted after the end of the
1085510674
// process to keep correct order.
10856-
return Delayed;
10675+
return *Delayed;
1085710676
}
1085810677
if (GatherShuffles.size() == 1 &&
1085910678
*GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
@@ -11062,14 +10881,16 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
1106210881
IsUsedInExpr &=
1106310882
FindReusedSplat(VecMask, TEs.front()->getVectorFactor());
1106410883
ShuffleBuilder.add(*TEs.front(), VecMask);
11065-
IsNonPoisoned &=
11066-
isGuaranteedNotToBePoison(TEs.front()->VectorizedValue);
10884+
if (TEs.front()->VectorizedValue)
10885+
IsNonPoisoned &=
10886+
isGuaranteedNotToBePoison(TEs.front()->VectorizedValue);
1106710887
} else {
1106810888
IsUsedInExpr = false;
1106910889
ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
11070-
IsNonPoisoned &=
11071-
isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) &&
11072-
isGuaranteedNotToBePoison(TEs.back()->VectorizedValue);
10890+
if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
10891+
IsNonPoisoned &=
10892+
isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) &&
10893+
isGuaranteedNotToBePoison(TEs.back()->VectorizedValue);
1107310894
}
1107410895
}
1107510896
}
@@ -11128,7 +10949,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
1112810949
if (!all_of(GatheredScalars, PoisonValue::classof)) {
1112910950
SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
1113010951
TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
11131-
Value *BV = ShuffleBuilder.gather(GatheredScalars);
10952+
Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
1113210953
ShuffleBuilder.add(BV, BVMask);
1113310954
}
1113410955
if (all_of(NonConstants, [=](Value *V) {
@@ -11142,13 +10963,13 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
1114210963
E->ReuseShuffleIndices, E->Scalars.size(),
1114310964
[&](Value *&Vec, SmallVectorImpl<int> &Mask) {
1114410965
TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
11145-
Vec = ShuffleBuilder.gather(NonConstants, Vec);
10966+
Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
1114610967
});
1114710968
} else if (!allConstant(GatheredScalars)) {
1114810969
// Gather unique scalars and all constants.
1114910970
SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
1115010971
TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
11151-
Value *BV = ShuffleBuilder.gather(GatheredScalars);
10972+
Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
1115210973
ShuffleBuilder.add(BV, ReuseMask);
1115310974
Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
1115410975
} else {

0 commit comments

Comments
 (0)