-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[SLP] Fix the Vec lane overridden by the shuffle mask #106341
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-llvm-transforms Author: tcwzxx (tcwzxx) ChangesCurrently, SLP uses shuffle for the external user of %li13 = load i8, ptr getelementptr (i8, ptr null, i32 13), align 1
%2 = load <8 x i8>, ptr getelementptr (i8, ptr null, i32 8), align 1
%li12 = load i8, ptr getelementptr (i8, ptr null, i32 12), align 1
%li11164 = load i8, ptr getelementptr (i8, ptr null, i32 11), align 1
%li10 = load i8, ptr getelementptr (i8, ptr null, i32 10), align 1
%li9163 = load i8, ptr getelementptr (i8, ptr null, i32 9), align 1
%li8 = load i8, ptr getelementptr (i8, ptr null, i32 8), align 1
%li14 = load i8, ptr getelementptr (i8, ptr null, i32 14), align 1
%li15165 = load i8, ptr getelementptr (i8, ptr null, i32 15), align 1
%lupto8237 = insertelement <16 x i8> zeroinitializer, i8 %li8, i64 8
%lupto9238 = insertelement <16 x i8> %lupto8237, i8 %li9163, i64 9
%lupto10239 = insertelement <16 x i8> %lupto9238, i8 %li10, i64 10
%lupto11240 = insertelement <16 x i8> %lupto10239, i8 %li11164, i64 11
%lupto12241 = insertelement <16 x i8> %lupto11240, i8 %li12, i64 12
%3 = shufflevector <8 x i8> %2, <8 x i8> poison, <16 x i32> <i32 7, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison>
%lupto132421 = shufflevector <16 x i8> zeroinitializer, <16 x i8> %3, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 14, i32 15>
%lupto13242 = insertelement <16 x i8> %lupto12241, i8 %li13, i64 13
%lupto14243 = insertelement <16 x i8> %lupto13242, i8 %li14, i64 1
%l = insertelement <16 x i8> %lupto14243, i8 %li15165, i64 0
%li15 = extractelement <16 x i8> %l, i64 15
%4 = insertelement <16 x i8> %lupto132421, i8 %0, i32 0
%5 = insertelement <16 x i8> %4, i8 %1, i32 1
%6 = insertelement <16 x i8> %5, i8 0, i32 7
%7 = insertelement <16 x i8> %6, i8 %li9163, i32 9
%8 = shufflevector <16 x i8> %7, <16 x i8> poison, <16 x i32> <i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 9, i32 0, i32 0, i32 0, i32 0, i32 0, i32 15>
%9 = icmp ne <16 x i8> zeroinitializer, %8 1、 Full diff: https://github.com/llvm/llvm-project/pull/106341.diff 2 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index ed47ed661ab946..9f50b6181bf95d 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -10658,6 +10658,17 @@ static T *performExtractsShuffleAction(
return Prev;
}
+namespace {
+/// Data type for handling buildvector sequences with the reused scalars from
+/// other tree entries.
+template <typename T> struct ShuffledInsertData {
+ /// List of insertelements to be replaced by shuffles.
+ SmallVector<InsertElementInst *> InsertElements;
+ /// The parent vectors and shuffle mask for the given list of inserts.
+ MapVector<T, SmallVector<int>> ValueMasks;
+};
+} // namespace
+
InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
InstructionCost Cost = 0;
LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
@@ -10699,8 +10710,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
SmallPtrSet<Value *, 16> ExtractCostCalculated;
InstructionCost ExtractCost = 0;
- SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks;
- SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
+ SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
SmallVector<APInt> DemandedElts;
SmallDenseSet<Value *, 4> UsedInserts;
DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
@@ -10737,11 +10747,12 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
if (InsertIdx) {
const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
auto *It = find_if(
- FirstUsers,
- [this, VU](const std::pair<Value *, const TreeEntry *> &Pair) {
+ ShuffledInserts,
+ [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
+ // Checks if 2 insertelements are from the same buildvector.
+ InsertElementInst *VecInsert = Data.InsertElements.front();
return areTwoInsertFromSameBuildVector(
- VU, cast<InsertElementInst>(Pair.first),
- [this](InsertElementInst *II) -> Value * {
+ VU, VecInsert, [this](InsertElementInst *II) -> Value * {
Value *Op0 = II->getOperand(0);
if (getTreeEntry(II) && !getTreeEntry(Op0))
return nullptr;
@@ -10749,36 +10760,11 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
});
});
int VecId = -1;
- if (It == FirstUsers.end()) {
- (void)ShuffleMasks.emplace_back();
- SmallVectorImpl<int> &Mask = ShuffleMasks.back()[ScalarTE];
- if (Mask.empty())
- Mask.assign(FTy->getNumElements(), PoisonMaskElem);
- // Find the insertvector, vectorized in tree, if any.
- Value *Base = VU;
- while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
- if (IEBase != EU.User &&
- (!IEBase->hasOneUse() ||
- getElementIndex(IEBase).value_or(*InsertIdx) == *InsertIdx))
- break;
- // Build the mask for the vectorized insertelement instructions.
- if (const TreeEntry *E = getTreeEntry(IEBase)) {
- VU = IEBase;
- do {
- IEBase = cast<InsertElementInst>(Base);
- int Idx = *getElementIndex(IEBase);
- assert(Mask[Idx] == PoisonMaskElem &&
- "InsertElementInstruction used already.");
- Mask[Idx] = Idx;
- Base = IEBase->getOperand(0);
- } while (E == getTreeEntry(Base));
- break;
- }
- Base = cast<InsertElementInst>(Base)->getOperand(0);
- }
- FirstUsers.emplace_back(VU, ScalarTE);
+ if (It == ShuffledInserts.end()) {
+ auto &Data = ShuffledInserts.emplace_back();
+ Data.InsertElements.emplace_back(VU);
DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
- VecId = FirstUsers.size() - 1;
+ VecId = ShuffledInserts.size() - 1;
auto It = MinBWs.find(ScalarTE);
if (It != MinBWs.end() &&
VectorCasts
@@ -10804,12 +10790,13 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
Cost += C;
}
} else {
- if (isFirstInsertElement(VU, cast<InsertElementInst>(It->first)))
- It->first = VU;
- VecId = std::distance(FirstUsers.begin(), It);
+ if (isFirstInsertElement(VU, It->InsertElements.front()))
+ It->InsertElements.front() = VU;
+ VecId = std::distance(ShuffledInserts.begin(), It);
}
int InIdx = *InsertIdx;
- SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE];
+ SmallVectorImpl<int> &Mask =
+ ShuffledInserts[VecId].ValueMasks[ScalarTE];
if (Mask.empty())
Mask.assign(FTy->getNumElements(), PoisonMaskElem);
Mask[InIdx] = EU.Lane;
@@ -10973,9 +10960,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
return std::make_pair(TE, false);
};
// Calculate the cost of the reshuffled vectors, if any.
- for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
- Value *Base = cast<Instruction>(FirstUsers[I].first)->getOperand(0);
- auto Vector = ShuffleMasks[I].takeVector();
+ for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
+ Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
+ auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
unsigned VF = 0;
auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
ArrayRef<const TreeEntry *> TEs) {
@@ -11026,7 +11013,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
[](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
EstimateShufflesCost);
InstructionCost InsertCost = TTI->getScalarizationOverhead(
- cast<FixedVectorType>(FirstUsers[I].first->getType()), DemandedElts[I],
+ cast<FixedVectorType>(
+ ShuffledInserts[I].InsertElements.front()->getType()),
+ DemandedElts[I],
/*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
Cost -= InsertCost;
}
@@ -14107,17 +14096,6 @@ Value *BoUpSLP::vectorizeTree() {
return vectorizeTree(ExternallyUsedValues, ReplacedExternals);
}
-namespace {
-/// Data type for handling buildvector sequences with the reused scalars from
-/// other tree entries.
-struct ShuffledInsertData {
- /// List of insertelements to be replaced by shuffles.
- SmallVector<InsertElementInst *> InsertElements;
- /// The parent vectors and shuffle mask for the given list of inserts.
- MapVector<Value *, SmallVector<int>> ValueMasks;
-};
-} // namespace
-
Value *BoUpSLP::vectorizeTree(
const ExtraValueToDebugLocsMap &ExternallyUsedValues,
SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
@@ -14255,7 +14233,7 @@ Value *BoUpSLP::vectorizeTree(
LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
<< " values .\n");
- SmallVector<ShuffledInsertData> ShuffledInserts;
+ SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
// Maps vector instruction to original insertelement instruction
DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
// Maps extract Scalar to the corresponding extractelement instruction in the
@@ -14468,8 +14446,8 @@ Value *BoUpSLP::vectorizeTree(
std::optional<unsigned> InsertIdx = getElementIndex(VU);
if (InsertIdx) {
- auto *It =
- find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) {
+ auto *It = find_if(
+ ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
// Checks if 2 insertelements are from the same buildvector.
InsertElementInst *VecInsert = Data.InsertElements.front();
return areTwoInsertFromSameBuildVector(
@@ -14481,36 +14459,6 @@ Value *BoUpSLP::vectorizeTree(
(void)ShuffledInserts.emplace_back();
It = std::next(ShuffledInserts.begin(),
ShuffledInserts.size() - 1);
- SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
- if (Mask.empty())
- Mask.assign(FTy->getNumElements(), PoisonMaskElem);
- // Find the insertvector, vectorized in tree, if any.
- Value *Base = VU;
- while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
- if (IEBase != User &&
- (!IEBase->hasOneUse() ||
- getElementIndex(IEBase).value_or(Idx) == Idx))
- break;
- // Build the mask for the vectorized insertelement instructions.
- if (const TreeEntry *E = getTreeEntry(IEBase)) {
- do {
- IEBase = cast<InsertElementInst>(Base);
- int IEIdx = *getElementIndex(IEBase);
- assert(Mask[IEIdx] == PoisonMaskElem &&
- "InsertElementInstruction used already.");
- Mask[IEIdx] = IEIdx;
- Base = IEBase->getOperand(0);
- } while (E == getTreeEntry(Base));
- break;
- }
- Base = cast<InsertElementInst>(Base)->getOperand(0);
- // After the vectorization the def-use chain has changed, need
- // to look through original insertelement instructions, if they
- // get replaced by vector instructions.
- auto It = VectorToInsertElement.find(Base);
- if (It != VectorToInsertElement.end())
- Base = It->second;
- }
}
SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
if (Mask.empty())
diff --git a/llvm/test/Transforms/SLPVectorizer/insertelement-across-zero.ll b/llvm/test/Transforms/SLPVectorizer/insertelement-across-zero.ll
index 1d54f59f2fdd84..28afa40640bf63 100644
--- a/llvm/test/Transforms/SLPVectorizer/insertelement-across-zero.ll
+++ b/llvm/test/Transforms/SLPVectorizer/insertelement-across-zero.ll
@@ -12,7 +12,7 @@ define void @test(i8 %0, i8 %1) {
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[TMP1]], i32 1
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 0, i32 7
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> [[TMP7]], <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 14, i32 15>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> [[TMP7]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> poison, <16 x i32> <i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 9, i32 0, i32 0, i32 0, i32 0, i32 0, i32 15>
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i8> zeroinitializer, [[TMP9]]
; CHECK-NEXT: ret void
|
@@ -12,7 +12,7 @@ define void @test(i8 %0, i8 %1) { | |||
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[TMP1]], i32 1 | |||
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 0, i32 7 | |||
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> | |||
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> [[TMP7]], <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 14, i32 15> | |||
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> [[TMP7]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I rather doubt it is correct. Here you're picking the first element from %TMP6 (which is %TMP0 or %0 in the original code), while it should be 0th element from %TMP2 (%li8 in the original code)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The chain of icmp
instructions has its first element as %0
. However, the first element of %TMP9
is the first element of %TMP8
, which is %li8
. %TMP9
corresponds to the chain of icmp
instructions.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
while it should be 0th element from %TMP2 (%li8 in the original code)
The current vectorized result is as you mentioned, but it is miscompiled.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok, I see what you mean, you're right
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LG
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/17/builds/2261 Here is the relevant piece of the build log for the reference
|
Currently, SLP uses shuffle for the external user of
InsertElementInst
and iterates through theInsertElementInst
chain to fill the mask with constant indices. However, it may override the original Vec lane. Using the original Vec lane is sufficient.1、
%l
is vectorized to%lupto132421
2、When processing
%7
, it will build a mask that will overiide%4
and%5