Skip to content

Commit 121fb2c

Browse files
authored
[SLP] Fix the Vec lane overridden by the shuffle mask (#106341)
Currently, SLP uses shuffle for the external user of `InsertElementInst` and iterates through the `InsertElementInst` chain to fill the mask with constant indices. However, it may override the original Vec lane. Using the original Vec lane is sufficient.
1 parent 87c86aa commit 121fb2c

File tree

2 files changed

+36
-88
lines changed

2 files changed

+36
-88
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 35 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -10653,6 +10653,17 @@ static T *performExtractsShuffleAction(
1065310653
return Prev;
1065410654
}
1065510655

10656+
namespace {
10657+
/// Data type for handling buildvector sequences with the reused scalars from
10658+
/// other tree entries.
10659+
template <typename T> struct ShuffledInsertData {
10660+
/// List of insertelements to be replaced by shuffles.
10661+
SmallVector<InsertElementInst *> InsertElements;
10662+
/// The parent vectors and shuffle mask for the given list of inserts.
10663+
MapVector<T, SmallVector<int>> ValueMasks;
10664+
};
10665+
} // namespace
10666+
1065610667
InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
1065710668
InstructionCost Cost = 0;
1065810669
LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
@@ -10694,8 +10705,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
1069410705

1069510706
SmallPtrSet<Value *, 16> ExtractCostCalculated;
1069610707
InstructionCost ExtractCost = 0;
10697-
SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks;
10698-
SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
10708+
SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
1069910709
SmallVector<APInt> DemandedElts;
1070010710
SmallDenseSet<Value *, 4> UsedInserts;
1070110711
DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
@@ -10732,48 +10742,24 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
1073210742
if (InsertIdx) {
1073310743
const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
1073410744
auto *It = find_if(
10735-
FirstUsers,
10736-
[this, VU](const std::pair<Value *, const TreeEntry *> &Pair) {
10745+
ShuffledInserts,
10746+
[this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
10747+
// Checks if 2 insertelements are from the same buildvector.
10748+
InsertElementInst *VecInsert = Data.InsertElements.front();
1073710749
return areTwoInsertFromSameBuildVector(
10738-
VU, cast<InsertElementInst>(Pair.first),
10739-
[this](InsertElementInst *II) -> Value * {
10750+
VU, VecInsert, [this](InsertElementInst *II) -> Value * {
1074010751
Value *Op0 = II->getOperand(0);
1074110752
if (getTreeEntry(II) && !getTreeEntry(Op0))
1074210753
return nullptr;
1074310754
return Op0;
1074410755
});
1074510756
});
1074610757
int VecId = -1;
10747-
if (It == FirstUsers.end()) {
10748-
(void)ShuffleMasks.emplace_back();
10749-
SmallVectorImpl<int> &Mask = ShuffleMasks.back()[ScalarTE];
10750-
if (Mask.empty())
10751-
Mask.assign(FTy->getNumElements(), PoisonMaskElem);
10752-
// Find the insertvector, vectorized in tree, if any.
10753-
Value *Base = VU;
10754-
while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
10755-
if (IEBase != EU.User &&
10756-
(!IEBase->hasOneUse() ||
10757-
getElementIndex(IEBase).value_or(*InsertIdx) == *InsertIdx))
10758-
break;
10759-
// Build the mask for the vectorized insertelement instructions.
10760-
if (const TreeEntry *E = getTreeEntry(IEBase)) {
10761-
VU = IEBase;
10762-
do {
10763-
IEBase = cast<InsertElementInst>(Base);
10764-
int Idx = *getElementIndex(IEBase);
10765-
assert(Mask[Idx] == PoisonMaskElem &&
10766-
"InsertElementInstruction used already.");
10767-
Mask[Idx] = Idx;
10768-
Base = IEBase->getOperand(0);
10769-
} while (E == getTreeEntry(Base));
10770-
break;
10771-
}
10772-
Base = cast<InsertElementInst>(Base)->getOperand(0);
10773-
}
10774-
FirstUsers.emplace_back(VU, ScalarTE);
10758+
if (It == ShuffledInserts.end()) {
10759+
auto &Data = ShuffledInserts.emplace_back();
10760+
Data.InsertElements.emplace_back(VU);
1077510761
DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
10776-
VecId = FirstUsers.size() - 1;
10762+
VecId = ShuffledInserts.size() - 1;
1077710763
auto It = MinBWs.find(ScalarTE);
1077810764
if (It != MinBWs.end() &&
1077910765
VectorCasts
@@ -10799,12 +10785,13 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
1079910785
Cost += C;
1080010786
}
1080110787
} else {
10802-
if (isFirstInsertElement(VU, cast<InsertElementInst>(It->first)))
10803-
It->first = VU;
10804-
VecId = std::distance(FirstUsers.begin(), It);
10788+
if (isFirstInsertElement(VU, It->InsertElements.front()))
10789+
It->InsertElements.front() = VU;
10790+
VecId = std::distance(ShuffledInserts.begin(), It);
1080510791
}
1080610792
int InIdx = *InsertIdx;
10807-
SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE];
10793+
SmallVectorImpl<int> &Mask =
10794+
ShuffledInserts[VecId].ValueMasks[ScalarTE];
1080810795
if (Mask.empty())
1080910796
Mask.assign(FTy->getNumElements(), PoisonMaskElem);
1081010797
Mask[InIdx] = EU.Lane;
@@ -10978,9 +10965,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
1097810965
return std::make_pair(TE, false);
1097910966
};
1098010967
// Calculate the cost of the reshuffled vectors, if any.
10981-
for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
10982-
Value *Base = cast<Instruction>(FirstUsers[I].first)->getOperand(0);
10983-
auto Vector = ShuffleMasks[I].takeVector();
10968+
for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
10969+
Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
10970+
auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
1098410971
unsigned VF = 0;
1098510972
auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
1098610973
ArrayRef<const TreeEntry *> TEs) {
@@ -11031,7 +11018,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
1103111018
[](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
1103211019
EstimateShufflesCost);
1103311020
InstructionCost InsertCost = TTI->getScalarizationOverhead(
11034-
cast<FixedVectorType>(FirstUsers[I].first->getType()), DemandedElts[I],
11021+
cast<FixedVectorType>(
11022+
ShuffledInserts[I].InsertElements.front()->getType()),
11023+
DemandedElts[I],
1103511024
/*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
1103611025
Cost -= InsertCost;
1103711026
}
@@ -14131,17 +14120,6 @@ Value *BoUpSLP::vectorizeTree() {
1413114120
return vectorizeTree(ExternallyUsedValues, ReplacedExternals);
1413214121
}
1413314122

14134-
namespace {
14135-
/// Data type for handling buildvector sequences with the reused scalars from
14136-
/// other tree entries.
14137-
struct ShuffledInsertData {
14138-
/// List of insertelements to be replaced by shuffles.
14139-
SmallVector<InsertElementInst *> InsertElements;
14140-
/// The parent vectors and shuffle mask for the given list of inserts.
14141-
MapVector<Value *, SmallVector<int>> ValueMasks;
14142-
};
14143-
} // namespace
14144-
1414514123
Value *BoUpSLP::vectorizeTree(
1414614124
const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1414714125
SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
@@ -14279,7 +14257,7 @@ Value *BoUpSLP::vectorizeTree(
1427914257
LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
1428014258
<< " values .\n");
1428114259

14282-
SmallVector<ShuffledInsertData> ShuffledInserts;
14260+
SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
1428314261
// Maps vector instruction to original insertelement instruction
1428414262
DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
1428514263
// Maps extract Scalar to the corresponding extractelement instruction in the
@@ -14492,8 +14470,8 @@ Value *BoUpSLP::vectorizeTree(
1449214470

1449314471
std::optional<unsigned> InsertIdx = getElementIndex(VU);
1449414472
if (InsertIdx) {
14495-
auto *It =
14496-
find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) {
14473+
auto *It = find_if(
14474+
ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
1449714475
// Checks if 2 insertelements are from the same buildvector.
1449814476
InsertElementInst *VecInsert = Data.InsertElements.front();
1449914477
return areTwoInsertFromSameBuildVector(
@@ -14505,36 +14483,6 @@ Value *BoUpSLP::vectorizeTree(
1450514483
(void)ShuffledInserts.emplace_back();
1450614484
It = std::next(ShuffledInserts.begin(),
1450714485
ShuffledInserts.size() - 1);
14508-
SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
14509-
if (Mask.empty())
14510-
Mask.assign(FTy->getNumElements(), PoisonMaskElem);
14511-
// Find the insertvector, vectorized in tree, if any.
14512-
Value *Base = VU;
14513-
while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
14514-
if (IEBase != User &&
14515-
(!IEBase->hasOneUse() ||
14516-
getElementIndex(IEBase).value_or(Idx) == Idx))
14517-
break;
14518-
// Build the mask for the vectorized insertelement instructions.
14519-
if (const TreeEntry *E = getTreeEntry(IEBase)) {
14520-
do {
14521-
IEBase = cast<InsertElementInst>(Base);
14522-
int IEIdx = *getElementIndex(IEBase);
14523-
assert(Mask[IEIdx] == PoisonMaskElem &&
14524-
"InsertElementInstruction used already.");
14525-
Mask[IEIdx] = IEIdx;
14526-
Base = IEBase->getOperand(0);
14527-
} while (E == getTreeEntry(Base));
14528-
break;
14529-
}
14530-
Base = cast<InsertElementInst>(Base)->getOperand(0);
14531-
// After the vectorization the def-use chain has changed, need
14532-
// to look through original insertelement instructions, if they
14533-
// get replaced by vector instructions.
14534-
auto It = VectorToInsertElement.find(Base);
14535-
if (It != VectorToInsertElement.end())
14536-
Base = It->second;
14537-
}
1453814486
}
1453914487
SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
1454014488
if (Mask.empty())

llvm/test/Transforms/SLPVectorizer/insertelement-across-zero.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ define void @test(i8 %0, i8 %1) {
1212
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[TMP1]], i32 1
1313
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 0, i32 7
1414
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
15-
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> [[TMP7]], <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 14, i32 15>
15+
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> [[TMP7]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1616
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> poison, <16 x i32> <i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 9, i32 0, i32 0, i32 0, i32 0, i32 0, i32 15>
1717
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i8> zeroinitializer, [[TMP9]]
1818
; CHECK-NEXT: ret void

0 commit comments

Comments
 (0)