Skip to content

[SLP] Fix the Vec lane overridden by the shuffle mask #106341

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 35 additions & 87 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10658,6 +10658,17 @@ static T *performExtractsShuffleAction(
return Prev;
}

namespace {
/// Data type for handling buildvector sequences with the reused scalars from
/// other tree entries.
template <typename T> struct ShuffledInsertData {
/// List of insertelements to be replaced by shuffles.
SmallVector<InsertElementInst *> InsertElements;
/// The parent vectors and shuffle mask for the given list of inserts.
MapVector<T, SmallVector<int>> ValueMasks;
};
} // namespace

InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
InstructionCost Cost = 0;
LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
Expand Down Expand Up @@ -10699,8 +10710,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {

SmallPtrSet<Value *, 16> ExtractCostCalculated;
InstructionCost ExtractCost = 0;
SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks;
SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
SmallVector<APInt> DemandedElts;
SmallDenseSet<Value *, 4> UsedInserts;
DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
Expand Down Expand Up @@ -10737,48 +10747,24 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
if (InsertIdx) {
const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
auto *It = find_if(
FirstUsers,
[this, VU](const std::pair<Value *, const TreeEntry *> &Pair) {
ShuffledInserts,
[this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
// Checks if 2 insertelements are from the same buildvector.
InsertElementInst *VecInsert = Data.InsertElements.front();
return areTwoInsertFromSameBuildVector(
VU, cast<InsertElementInst>(Pair.first),
[this](InsertElementInst *II) -> Value * {
VU, VecInsert, [this](InsertElementInst *II) -> Value * {
Value *Op0 = II->getOperand(0);
if (getTreeEntry(II) && !getTreeEntry(Op0))
return nullptr;
return Op0;
});
});
int VecId = -1;
if (It == FirstUsers.end()) {
(void)ShuffleMasks.emplace_back();
SmallVectorImpl<int> &Mask = ShuffleMasks.back()[ScalarTE];
if (Mask.empty())
Mask.assign(FTy->getNumElements(), PoisonMaskElem);
// Find the insertvector, vectorized in tree, if any.
Value *Base = VU;
while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
if (IEBase != EU.User &&
(!IEBase->hasOneUse() ||
getElementIndex(IEBase).value_or(*InsertIdx) == *InsertIdx))
break;
// Build the mask for the vectorized insertelement instructions.
if (const TreeEntry *E = getTreeEntry(IEBase)) {
VU = IEBase;
do {
IEBase = cast<InsertElementInst>(Base);
int Idx = *getElementIndex(IEBase);
assert(Mask[Idx] == PoisonMaskElem &&
"InsertElementInstruction used already.");
Mask[Idx] = Idx;
Base = IEBase->getOperand(0);
} while (E == getTreeEntry(Base));
break;
}
Base = cast<InsertElementInst>(Base)->getOperand(0);
}
FirstUsers.emplace_back(VU, ScalarTE);
if (It == ShuffledInserts.end()) {
auto &Data = ShuffledInserts.emplace_back();
Data.InsertElements.emplace_back(VU);
DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
VecId = FirstUsers.size() - 1;
VecId = ShuffledInserts.size() - 1;
auto It = MinBWs.find(ScalarTE);
if (It != MinBWs.end() &&
VectorCasts
Expand All @@ -10804,12 +10790,13 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
Cost += C;
}
} else {
if (isFirstInsertElement(VU, cast<InsertElementInst>(It->first)))
It->first = VU;
VecId = std::distance(FirstUsers.begin(), It);
if (isFirstInsertElement(VU, It->InsertElements.front()))
It->InsertElements.front() = VU;
VecId = std::distance(ShuffledInserts.begin(), It);
}
int InIdx = *InsertIdx;
SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE];
SmallVectorImpl<int> &Mask =
ShuffledInserts[VecId].ValueMasks[ScalarTE];
if (Mask.empty())
Mask.assign(FTy->getNumElements(), PoisonMaskElem);
Mask[InIdx] = EU.Lane;
Expand Down Expand Up @@ -10973,9 +10960,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
return std::make_pair(TE, false);
};
// Calculate the cost of the reshuffled vectors, if any.
for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
Value *Base = cast<Instruction>(FirstUsers[I].first)->getOperand(0);
auto Vector = ShuffleMasks[I].takeVector();
for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
unsigned VF = 0;
auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
ArrayRef<const TreeEntry *> TEs) {
Expand Down Expand Up @@ -11026,7 +11013,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
[](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
EstimateShufflesCost);
InstructionCost InsertCost = TTI->getScalarizationOverhead(
cast<FixedVectorType>(FirstUsers[I].first->getType()), DemandedElts[I],
cast<FixedVectorType>(
ShuffledInserts[I].InsertElements.front()->getType()),
DemandedElts[I],
/*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
Cost -= InsertCost;
}
Expand Down Expand Up @@ -14107,17 +14096,6 @@ Value *BoUpSLP::vectorizeTree() {
return vectorizeTree(ExternallyUsedValues, ReplacedExternals);
}

namespace {
/// Data type for handling buildvector sequences with the reused scalars from
/// other tree entries.
struct ShuffledInsertData {
/// List of insertelements to be replaced by shuffles.
SmallVector<InsertElementInst *> InsertElements;
/// The parent vectors and shuffle mask for the given list of inserts.
MapVector<Value *, SmallVector<int>> ValueMasks;
};
} // namespace

Value *BoUpSLP::vectorizeTree(
const ExtraValueToDebugLocsMap &ExternallyUsedValues,
SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
Expand Down Expand Up @@ -14255,7 +14233,7 @@ Value *BoUpSLP::vectorizeTree(
LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
<< " values .\n");

SmallVector<ShuffledInsertData> ShuffledInserts;
SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
// Maps vector instruction to original insertelement instruction
DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
// Maps extract Scalar to the corresponding extractelement instruction in the
Expand Down Expand Up @@ -14468,8 +14446,8 @@ Value *BoUpSLP::vectorizeTree(

std::optional<unsigned> InsertIdx = getElementIndex(VU);
if (InsertIdx) {
auto *It =
find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) {
auto *It = find_if(
ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
// Checks if 2 insertelements are from the same buildvector.
InsertElementInst *VecInsert = Data.InsertElements.front();
return areTwoInsertFromSameBuildVector(
Expand All @@ -14481,36 +14459,6 @@ Value *BoUpSLP::vectorizeTree(
(void)ShuffledInserts.emplace_back();
It = std::next(ShuffledInserts.begin(),
ShuffledInserts.size() - 1);
SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
if (Mask.empty())
Mask.assign(FTy->getNumElements(), PoisonMaskElem);
// Find the insertvector, vectorized in tree, if any.
Value *Base = VU;
while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
if (IEBase != User &&
(!IEBase->hasOneUse() ||
getElementIndex(IEBase).value_or(Idx) == Idx))
break;
// Build the mask for the vectorized insertelement instructions.
if (const TreeEntry *E = getTreeEntry(IEBase)) {
do {
IEBase = cast<InsertElementInst>(Base);
int IEIdx = *getElementIndex(IEBase);
assert(Mask[IEIdx] == PoisonMaskElem &&
"InsertElementInstruction used already.");
Mask[IEIdx] = IEIdx;
Base = IEBase->getOperand(0);
} while (E == getTreeEntry(Base));
break;
}
Base = cast<InsertElementInst>(Base)->getOperand(0);
// After the vectorization the def-use chain has changed, need
// to look through original insertelement instructions, if they
// get replaced by vector instructions.
auto It = VectorToInsertElement.find(Base);
if (It != VectorToInsertElement.end())
Base = It->second;
}
}
SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
if (Mask.empty())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ define void @test(i8 %0, i8 %1) {
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[TMP1]], i32 1
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 0, i32 7
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> [[TMP7]], <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 14, i32 15>
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> [[TMP7]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I rather doubt it is correct. Here you're picking the first element from %TMP6 (which is %TMP0 or %0 in the original code), while it should be 0th element from %TMP2 (%li8 in the original code)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The chain of icmp instructions has its first element as %0. However, the first element of %TMP9 is the first element of %TMP8, which is %li8. %TMP9 corresponds to the chain of icmp instructions.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

while it should be 0th element from %TMP2 (%li8 in the original code)

The current vectorized result is as you mentioned, but it is miscompiled.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, I see what you mean, you're right

; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> poison, <16 x i32> <i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 9, i32 0, i32 0, i32 0, i32 0, i32 0, i32 15>
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i8> zeroinitializer, [[TMP9]]
; CHECK-NEXT: ret void
Expand Down
Loading