[SLP]Initial support for (masked)loads + compress and (masked)interleaved #132099
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-vectorizers

Author: Alexey Bataev (alexey-bataev)

Changes: Added initial support for (masked)loads + compress and (masked)interleaved loads.

Patch is 109.31 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/132099.diff

14 Files Affected:
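For orientation before the diff (an editorial sketch, not code from the patch): the idea is to replace several strided scalar loads with one contiguous wide load covering the whole accessed span — Diff + 1 elements, mirroring the LoadVecTy construction below — followed by a compress-style shuffle that keeps only the lanes the scalars actually read. A plain C++ analogue, with the array contents and mask values chosen purely for illustration:

#include <array>
#include <cstdio>

int main() {
  int a[8] = {10, 11, 12, 13, 14, 15, 16, 17};

  // Scalar form: four independent strided loads (what SLP would otherwise
  // treat as a gather).
  int S[4] = {a[0], a[2], a[4], a[6]};

  // Vectorized form: one contiguous load of Diff + 1 = 7 elements...
  std::array<int, 7> Wide;
  for (int I = 0; I < 7; ++I)
    Wide[I] = a[I]; // stands in for a single (possibly masked) vector load

  // ...then a compress shuffle keeping only the wanted lanes.
  const int CompressMask[4] = {0, 2, 4, 6};
  int V[4];
  for (int I = 0; I < 4; ++I)
    V[I] = Wide[CompressMask[I]];

  for (int I = 0; I < 4; ++I)
    std::printf("%d==%d ", S[I], V[I]); // prints 10==10 12==12 14==14 16==16
  std::printf("\n");
  return 0;
}

When every mask element equals Index * Interval (here {0, 2, 4, 6} with Interval = 2), the patch instead classifies the pattern as a (masked) interleaved load with factor Interval.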
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 1d9d80bd69def..f9905cc7c3307 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -38,6 +38,7 @@
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/IVDescriptors.h"
+#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
@@ -1378,7 +1379,8 @@ class BoUpSLP {
Gather,
Vectorize,
ScatterVectorize,
- StridedVectorize
+ StridedVectorize,
+ MaskedLoadCompressVectorize
};
using ValueList = SmallVector<Value *, 8>;
@@ -3378,6 +3380,7 @@ class BoUpSLP {
Vectorize, ///< The node is regularly vectorized.
ScatterVectorize, ///< Masked scatter/gather node.
StridedVectorize, ///< Strided loads (and stores)
+ MaskedLoadCompressVectorize, ///< Masked load with compress.
NeedToGather, ///< Gather/buildvector node.
CombinedVectorize, ///< Vectorized node, combined with its user into more
///< complex node like select/cmp to minmax, mul/add to
@@ -3604,6 +3607,9 @@ class BoUpSLP {
case StridedVectorize:
dbgs() << "StridedVectorize\n";
break;
+ case MaskedLoadCompressVectorize:
+ dbgs() << "MaskedLoadCompressVectorize\n";
+ break;
case NeedToGather:
dbgs() << "NeedToGather\n";
break;
@@ -4650,7 +4656,8 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
if (Entry->isGather())
return "color=red";
if (Entry->State == TreeEntry::ScatterVectorize ||
- Entry->State == TreeEntry::StridedVectorize)
+ Entry->State == TreeEntry::StridedVectorize ||
+ Entry->State == TreeEntry::MaskedLoadCompressVectorize)
return "color=blue";
return "";
}
@@ -5214,6 +5221,145 @@ static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
return Builder.CreateShuffleVector(Vec, Mask);
}
+/// Builds compress-like mask for shuffles for the given \p PointerOps, ordered
+/// with \p Order.
+static void buildCompressMask(ArrayRef<Value *> PointerOps,
+ ArrayRef<unsigned> Order, Type *ScalarTy,
+ const DataLayout &DL, ScalarEvolution &SE,
+ SmallVectorImpl<int> &CompressMask) {
+ const unsigned Sz = PointerOps.size();
+ CompressMask.assign(Sz, PoisonMaskElem);
+ // The first element is always set.
+ CompressMask[0] = 0;
+ Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
+ for (unsigned I : seq<unsigned>(1, Sz)) {
+ Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
+ unsigned Pos = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
+ CompressMask[I] = Pos;
+ }
+}
+
+/// Checks if the \p VL can be transformed to a (masked)load + compress or
+/// (masked) interleaved load.
+static bool isMaskedLoadCompress(
+ ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
+ ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
+ const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC,
+ const DominatorTree &DT, const TargetLibraryInfo &TLI,
+ const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
+ unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
+ VectorType *&LoadVecTy) {
+ InterleaveFactor = 0;
+ Type *ScalarTy = VL.front()->getType();
+ const unsigned Sz = VL.size();
+ auto *VecTy = getWidenedType(ScalarTy, Sz);
+ constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ // Check external uses.
+ for (const auto [I, V] : enumerate(VL)) {
+ if (AreAllUsersVectorized(V))
+ continue;
+ InstructionCost ExtractCost =
+ TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind, I);
+ InstructionCost ScalarCost =
+ TTI.getInstructionCost(cast<Instruction>(V), CostKind);
+ if (ExtractCost <= ScalarCost)
+ return false;
+ }
+ Value *Ptr0;
+ Value *PtrN;
+ if (Order.empty()) {
+ Ptr0 = PointerOps.front();
+ PtrN = PointerOps.back();
+ } else {
+ Ptr0 = PointerOps[Order.front()];
+ PtrN = PointerOps[Order.back()];
+ }
+ std::optional<int> Diff =
+ getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
+ if (!Diff)
+ return false;
+ const unsigned MaxRegSize =
+ TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
+ .getFixedValue();
+ // Check for very large distances between elements.
+ if (*Diff / Sz >= MaxRegSize / 8)
+ return false;
+ Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
+ LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
+ auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
+ bool IsNotMasked = isSafeToLoadUnconditionally(
+ Ptr0, LoadVecTy, CommonAlignment, DL,
+ cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT,
+ &TLI);
+ // TODO: perform the analysis of each scalar load for better
+ // safe-load-unconditionally analysis.
+ buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
+ assert(CompressMask.size() >= 2 && "At least two elements are required");
+ IsMasked = !IsNotMasked;
+ auto [ScalarGEPCost, VectorGEPCost] =
+ getGEPCosts(TTI, PointerOps, PointerOps.front(),
+ Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
+ // The cost of scalar loads.
+ InstructionCost ScalarLoadsCost =
+ std::accumulate(VL.begin(), VL.end(), InstructionCost(),
+ [&](InstructionCost C, Value *V) {
+ return C + TTI.getInstructionCost(cast<Instruction>(V),
+ CostKind);
+ }) +
+ ScalarGEPCost;
+ APInt DemandedElts = APInt::getAllOnes(Sz);
+ InstructionCost GatherCost =
+ getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
+ /*Insert=*/true,
+ /*Extract=*/false, CostKind) +
+ ScalarLoadsCost;
+ InstructionCost LoadCost = 0;
+ if (IsNotMasked)
+ LoadCost =
+ TTI.getMemoryOpCost(Instruction::Load, LoadVecTy,
+ IsNotMasked ? LI->getAlign() : CommonAlignment,
+ LI->getPointerAddressSpace(), CostKind);
+ else
+ LoadCost =
+ TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
+ LI->getPointerAddressSpace(), CostKind);
+ SmallVector<int> Mask;
+ if (!Order.empty())
+ inversePermutation(Order, Mask);
+ if (int Interval = CompressMask[1] - CompressMask[0];
+ Interval > 0 && all_of(enumerate(CompressMask), [&](const auto &D) {
+ return static_cast<unsigned>(D.value()) == D.index() * Interval;
+ })) {
+ // Check for potential segmented(interleaved) loads.
+ if (TTI.isLegalInterleavedAccessType(
+ LoadVecTy, Interval, IsNotMasked ? LI->getAlign() : CommonAlignment,
+ LI->getPointerAddressSpace())) {
+ InstructionCost InterleavedCost = TTI.getInterleavedMemoryOpCost(
+ Instruction::Load, LoadVecTy, Interval, std::nullopt,
+ IsNotMasked ? LI->getAlign() : CommonAlignment,
+ LI->getPointerAddressSpace(), CostKind, !IsNotMasked);
+ if (!Mask.empty())
+ InterleavedCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
+ LoadVecTy, CompressMask, CostKind);
+ if (InterleavedCost < GatherCost) {
+ InterleaveFactor = Interval;
+ return true;
+ }
+ }
+ }
+ if (!Order.empty()) {
+ SmallVector<int> NewMask(Sz, PoisonMaskElem);
+ for (unsigned I : seq<unsigned>(Sz)) {
+ NewMask[I] = CompressMask[Mask[I]];
+ }
+ CompressMask.swap(NewMask);
+ }
+ InstructionCost CompressCost = ::getShuffleCost(
+ TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
+ InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
+ return TotalVecCost < GatherCost;
+}
+
BoUpSLP::LoadsState
BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
SmallVectorImpl<unsigned> &Order,
@@ -5285,9 +5431,6 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
// Check that the sorted loads are consecutive.
if (static_cast<unsigned>(*Diff) == Sz - 1)
return LoadsState::Vectorize;
- if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
- TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
- return LoadsState::Gather;
// Simple check if not a strided access - clear order.
bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
// Try to generate strided load node if:
@@ -5343,7 +5486,22 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
}
}
}
+ [[maybe_unused]] bool IsMasked;
+ [[maybe_unused]] unsigned InterleaveFactor;
+ [[maybe_unused]] SmallVector<int> CompressMask;
+ [[maybe_unused]] VectorType *LoadVecTy;
+ if (isMaskedLoadCompress(
+ VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT, *TLI,
+ [&](Value *V) {
+ return areAllUsersVectorized(cast<Instruction>(V),
+ UserIgnoreList);
+ },
+ IsMasked, InterleaveFactor, CompressMask, LoadVecTy))
+ return LoadsState::MaskedLoadCompressVectorize;
}
+ if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
+ TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
+ return LoadsState::Gather;
// Correctly identify compare the cost of loads + shuffles rather than
// strided/masked gather loads. Returns true if vectorized + shuffles
// representation is better than just gather.
@@ -5436,7 +5594,8 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
}
// If need the reorder - consider as high-cost masked gather for now.
if ((LS == LoadsState::Vectorize ||
- LS == LoadsState::StridedVectorize) &&
+ LS == LoadsState::StridedVectorize ||
+ LS == LoadsState::MaskedLoadCompressVectorize) &&
!Order.empty() && !isReverseOrder(Order))
LS = LoadsState::ScatterVectorize;
States.push_back(LS);
@@ -5501,6 +5660,14 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
CommonAlignment, CostKind) +
VectorGEPCost;
break;
+ case LoadsState::MaskedLoadCompressVectorize:
+ VecLdCost += TTI.getMaskedMemoryOpCost(
+ Instruction::Load, SubVecTy, CommonAlignment,
+ LI0->getPointerAddressSpace(), CostKind) +
+ VectorGEPCost +
+ ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, SubVecTy,
+ {}, CostKind);
+ break;
case LoadsState::ScatterVectorize:
VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
LI0->getPointerOperand(),
@@ -5874,7 +6041,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
return std::nullopt;
if (TE.State == TreeEntry::SplitVectorize ||
((TE.State == TreeEntry::Vectorize ||
- TE.State == TreeEntry::StridedVectorize) &&
+ TE.State == TreeEntry::StridedVectorize ||
+ TE.State == TreeEntry::MaskedLoadCompressVectorize) &&
(isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
(TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))))) {
assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
@@ -6061,7 +6229,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
OrdersType CurrentOrder;
LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
CurrentOrder, PointerOps);
- if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize)
+ if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize ||
+ Res == LoadsState::MaskedLoadCompressVectorize)
return std::move(CurrentOrder);
}
// FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
@@ -6301,7 +6470,8 @@ void BoUpSLP::reorderTopToBottom() {
VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
if (!(TE->State == TreeEntry::Vectorize ||
TE->State == TreeEntry::StridedVectorize ||
- TE->State == TreeEntry::SplitVectorize) ||
+ TE->State == TreeEntry::SplitVectorize ||
+ TE->State == TreeEntry::MaskedLoadCompressVectorize) ||
!TE->ReuseShuffleIndices.empty())
GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
if (TE->State == TreeEntry::Vectorize &&
@@ -6478,7 +6648,8 @@ void BoUpSLP::reorderTopToBottom() {
if ((TE->State == TreeEntry::SplitVectorize &&
TE->ReuseShuffleIndices.empty()) ||
((TE->State == TreeEntry::Vectorize ||
- TE->State == TreeEntry::StridedVectorize) &&
+ TE->State == TreeEntry::StridedVectorize ||
+ TE->State == TreeEntry::MaskedLoadCompressVectorize) &&
(isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
InsertElementInst>(TE->getMainOp()) ||
(SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp()))))) {
@@ -6526,6 +6697,8 @@ bool BoUpSLP::canReorderOperands(
return OpData.first == I &&
(OpData.second->State == TreeEntry::Vectorize ||
OpData.second->State == TreeEntry::StridedVectorize ||
+ OpData.second->State ==
+ TreeEntry::MaskedLoadCompressVectorize ||
OpData.second->State == TreeEntry::SplitVectorize);
}))
continue;
@@ -6540,6 +6713,7 @@ bool BoUpSLP::canReorderOperands(
// node, just reorder reuses mask.
if (TE->State != TreeEntry::Vectorize &&
TE->State != TreeEntry::StridedVectorize &&
+ TE->State != TreeEntry::MaskedLoadCompressVectorize &&
TE->State != TreeEntry::SplitVectorize &&
TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
GatherOps.push_back(TE);
@@ -6550,6 +6724,7 @@ bool BoUpSLP::canReorderOperands(
[&Gather, UserTE, I](TreeEntry *TE) {
assert(TE->State != TreeEntry::Vectorize &&
TE->State != TreeEntry::StridedVectorize &&
+ TE->State != TreeEntry::MaskedLoadCompressVectorize &&
TE->State != TreeEntry::SplitVectorize &&
"Only non-vectorized nodes are expected.");
if (TE->UserTreeIndex.UserTE == UserTE &&
@@ -6586,6 +6761,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
if (TE->State != TreeEntry::Vectorize &&
TE->State != TreeEntry::StridedVectorize &&
+ TE->State != TreeEntry::MaskedLoadCompressVectorize &&
TE->State != TreeEntry::SplitVectorize)
NonVectorized.push_back(TE.get());
if (std::optional<OrdersType> CurrentOrder =
@@ -6593,6 +6769,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
Queue.push(TE.get());
if (!(TE->State == TreeEntry::Vectorize ||
TE->State == TreeEntry::StridedVectorize ||
+ TE->State == TreeEntry::MaskedLoadCompressVectorize ||
TE->State == TreeEntry::SplitVectorize) ||
!TE->ReuseShuffleIndices.empty())
GathersToOrders.insert(TE.get());
@@ -6621,6 +6798,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
for (TreeEntry *TE : OrderedOps) {
if (!(TE->State == TreeEntry::Vectorize ||
TE->State == TreeEntry::StridedVectorize ||
+ TE->State == TreeEntry::MaskedLoadCompressVectorize ||
TE->State == TreeEntry::SplitVectorize ||
(TE->isGather() && GathersToOrders.contains(TE))) ||
!TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
@@ -6918,6 +7096,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
// Gathers are processed separately.
if (TE->State != TreeEntry::Vectorize &&
TE->State != TreeEntry::StridedVectorize &&
+ TE->State != TreeEntry::MaskedLoadCompressVectorize &&
TE->State != TreeEntry::SplitVectorize &&
(TE->State != TreeEntry::ScatterVectorize ||
TE->ReorderIndices.empty()))
@@ -6950,7 +7129,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
Data.first->reorderOperands(Mask);
if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
Data.first->isAltShuffle() ||
- Data.first->State == TreeEntry::StridedVectorize) {
+ Data.first->State == TreeEntry::StridedVectorize ||
+ Data.first->State == TreeEntry::MaskedLoadCompressVectorize) {
reorderScalars(Data.first->Scalars, Mask);
reorderOrder(Data.first->ReorderIndices, MaskOrder,
/*BottomOrder=*/true);
@@ -7722,22 +7902,31 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
// just exit.
unsigned ConsecutiveNodesSize = 0;
if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
- any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
- [&, Slice = Slice](const auto &P) {
- const auto *It = find_if(Slice, [&](Value *V) {
- return std::get<1>(P).contains(V);
- });
- if (It == Slice.end())
- return false;
- ArrayRef<Value *> VL =
- VectorizableTree[std::get<0>(P)]->Scalars;
- ConsecutiveNodesSize += VL.size();
- unsigned Start = std::distance(Slice.begin(), It);
- unsigned Sz = Slice.size() - Start;
- return Sz < VL.size() ||
- Slice.slice(std::distance(Slice.begin(), It),
- VL.size()) != VL;
- }))
+ any_of(
+ zip(LoadEntriesToVectorize, LoadSetsToVectorize),
+ [&, Slice = Slice](const auto &P) {
+ const auto *It = find_if(Slice, [&](Value *V) {
+ return std::get<1>(P).contains(V);
+ });
+ if (It == Slice.end())
+ return false;
+ const TreeEntry &TE = *VectorizableTree[std::get<0>(P)];
+ ArrayRef<Value *> VL = TE.Scalars;
+ OrdersType Order;
+ SmallVector<Value *> PointerOps;
+ LoadsState State =
+ canVectorizeLoads(VL, VL.front(), Order,
+ PointerOps);
+ if (State == LoadsState::ScatterVectorize ||
+ State == LoadsState::MaskedLoadCompressVectorize)
+ return false;
+ ConsecutiveNodesSize += VL.size();
+ unsigned Start = std::distance(Slice.begin(), It);
+ unsigned Sz = Slice.size() - Start;
+ return Sz < VL.size() ||
+ Slice.slice(std::distance(Slice.begin(), It),
+ VL.size()) != VL;
+ }))
continue;
// Try to build long masked gather loads.
UserMaxVF = bit_ceil(UserMaxVF);
@@ -8216,6 +8405,13 @@ BoUpSLP::TreeEntr...
[truncated]
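To summarize the decision encoded in isMaskedLoadCompress above, here is a minimal editorial model; planLoadCompress, Decision, and the integer costs are invented stand-ins for the real TTI queries, not patch API. The wide load stays unmasked only when the whole span is known dereferenceable (isSafeToLoadUnconditionally in the real code), and the compressed form wins only if it is cheaper than gathering:

#include <optional>
#include <vector>

struct Decision {
  bool UseMaskedLoad;    // span not provably dereferenceable -> mask it
  std::vector<int> Mask; // compress shuffle mask over the wide load
};

// Offsets[I] models getPointersDiff(Ptr0, PointerOps[I]); the cost
// parameters model the corresponding TTI queries.
std::optional<Decision>
planLoadCompress(const std::vector<int> &Offsets, bool SpanIsDereferenceable,
                 int GatherCost, int WideLoadCost, int MaskedLoadCost,
                 int CompressShuffleCost) {
  Decision D;
  D.UseMaskedLoad = !SpanIsDereferenceable;
  D.Mask = Offsets; // per buildCompressMask: mask element = pointer diff
  int LoadCost = D.UseMaskedLoad ? MaskedLoadCost : WideLoadCost;
  if (LoadCost + CompressShuffleCost >= GatherCost)
    return std::nullopt; // not profitable: keep the scalar loads
  return D;
}

For instance, offsets {0, 2, 3, 6} over a dereferenceable span with GatherCost = 8, WideLoadCost = 2, and CompressShuffleCost = 1 yield an unmasked 7-element load plus shuffle mask {0, 2, 3, 6}.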
✅ With the latest revision this PR passed the C/C++ code formatter.

Supposed to fix #130872, but currently fixes it only for AVX512. Requires better pointer analysis to avoid masked loads.
@@ -5214,6 +5221,145 @@ static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
  return Builder.CreateShuffleVector(Vec, Mask);
}

/// Builds compress-like mask for shuffles for the given \p PointerOps, ordered
/// with \p Order.
static void buildCompressMask(ArrayRef<Value *> PointerOps, |
Worth returning true (or std::optional) here if the compression stride is uniform?
Not sure if this is possible at all. Can you share a scenario?
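Editorial note: the landed patch answers this inline via the Interval check in isMaskedLoadCompress rather than changing buildCompressMask's return type. A hypothetical std::optional-returning stride check in the spirit of the suggestion could look like this (the name and standalone form are invented for illustration):

#include <cstddef>
#include <optional>
#include <vector>

// Returns the interleave factor when the compress mask has a uniform
// stride (e.g. {0, 2, 4, 6} -> 2), std::nullopt otherwise.
std::optional<int> uniformStride(const std::vector<int> &CompressMask) {
  if (CompressMask.size() < 2)
    return std::nullopt;
  int Interval = CompressMask[1] - CompressMask[0];
  if (Interval <= 0)
    return std::nullopt;
  for (std::size_t I = 0; I < CompressMask.size(); ++I)
    if (CompressMask[I] != static_cast<int>(I) * Interval)
      return std::nullopt;
  return Interval;
}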
Ping!
Better comments describing what's going on?
Ping!
LGTM - cheers
Merged: [SLP]Initial support for (masked)loads + compress and (masked)interleaved

Added initial support for (masked)loads + compress and (masked)interleaved loads.

Reviewers: RKSimon, hiraditya

Reviewed By: RKSimon

Pull Request: llvm/llvm-project#132099
Hi! Could this be the cause of the clang-aarch64-sve-vls-2stage failure? https://lab.llvm.org/buildbot/#/builders/4/builds/6023
Maybe. Could you try to provide a reproducer? I don't have access to ARM-based machines and cannot reproduce it locally.
Sure, I'll try to do that.
Thanks!!!
We have also been hitting this crash since this commit. I've put a reproducer in #134411.
Hi. Probably useless now, but here it is:
I hope it is fixed already, but thanks anyway.
We had failures starting from this PR (https://lab.llvm.org/buildbot/#/builders/143/builds/6668), which have improved to the point where now only oggenc is failing (https://lab.llvm.org/buildbot/#/builders/143/builds/6821). What's the status here? That last build was a couple of hours ago, so I assume there are no fixes in flight. I doubt I can revert this cleanly at this point anyway. I will get the reproducer off of the machine for you.
Reproducer: oggenc-6b745c.zip

The logs are a mess due to parallel builds; this is what I could recover from them:
This did not fail on SVE VLA (vector length agnostic, aka scalable vectors), but did on VLS (vector length specific, aka fixed-size vectors).
Thanks, will fix ASAP.
…rectly identify profitability

Need to properly reorder the scalars when evaluating the costs for the external uses/GEPs, to prevent differences in the calculation of the profitability costs used to choose between gather/compressed loads.

Fixes #132099 (comment)
Should be fixed in 076318b.
It is fixed (https://lab.llvm.org/buildbot/#/builders/143/builds/6827), thanks!
There's another problem after this commit, which isn't fixed by 076318b. See #134578 (comment).
I managed to find a reproducer that's not as problematic as the one I mentioned in the previous comment. https://gcc.godbolt.org/z/cT75Pb4Ea
The assertion failure happens elsewhere, but the issue is triggered by this commit.
#135821
Thanks for the proper routing! Unfortunately, this is a distinct problem from the one reported in #134578 (comment). I'm still trying to figure out what's happening, and I don't have any good ideas. In the meantime, I reduced the IR that crashes the Clang built with the instrumented profile gathered from Clang running on a number of our internal compilations:
(also here: https://gcc.godbolt.org/z/5n4MEEosn) Any other Clang builds I tried (with ASan, MSan, assertions) don't crash on this IR, though.
I'll try to debug what's going on here.
I found that placing a
(if I place it after that line, Clang crashes)
FWIW, it looks like rewriting this as a loop would make the code slightly simpler:
or
And it also makes the crash disappear, though that may just be because the issue is fragile.
Hmm, semantically these two pieces are equal. Maybe a bug in the loop vectorizer that causes memory corruption on data rewriting? Did you compile Clang itself with another Clang? If so, what version?
This issue reproduces when bootstrapping Clang from this commit (19aec00) to the current head. I.e., if this is a miscompilation, then it should be present in the whole range starting from this commit. I haven't tried to compile Clang using earlier Clang versions.
Hmm, I tried to investigate the reproducer; it does not even trigger the modified compiler code, so probably it just adjusts the code layout and prevents the memory corruption.
Hi, thank you for your hard work, but I have to tell you that there is another assertion failure. Using test.ll.txt:

$ opt -S -passes=slp-vectorizer test.ll
opt: /work/home/tkawa/src/llvm-main/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp:17983: llvm::Value* llvm::slpvectorizer::BoUpSLP::vectorizeTree(TreeEntry*): Assertion `IsVectorized && "Expected to be vectorized"' failed.
PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace.
Stack dump:
0. Program arguments: opt -S -passes=slp-vectorizer test.ll
1. Running pass "function(slp-vectorizer)" on module "test.ll"
2. Running pass "slp-vectorizer" on function "_QQmain"
#0 0x0000ffff9d5fcee8 llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) (llvm-dev/lib/libLLVMSupport.so.21.0git+0x25cee8)
#1 0x0000ffff9d5fa5d0 llvm::sys::RunSignalHandlers() (llvm-dev/lib/libLLVMSupport.so.21.0git+0x25a5d0)
#2 0x0000ffff9d5fa738 SignalHandler(int, siginfo_t*, void*) Signals.cpp:0:0
#3 0x0000ffff9d71b7f0 (linux-vdso.so.1+0x7f0)
#4 0x0000ffff9d267608 (/lib/aarch64-linux-gnu/libc.so.6+0x87608)
#5 0x0000ffff9d21cb3c raise (/lib/aarch64-linux-gnu/libc.so.6+0x3cb3c)
#6 0x0000ffff9d207e00 abort (/lib/aarch64-linux-gnu/libc.so.6+0x27e00)
#7 0x0000ffff9d215cc0 (/lib/aarch64-linux-gnu/libc.so.6+0x35cc0)
#8 0x0000ffff9d215d30 __assert_perror_fail (/lib/aarch64-linux-gnu/libc.so.6+0x35d30)
#9 0x0000ffff99e50dd4 llvm::slpvectorizer::BoUpSLP::vectorizeTree(llvm::slpvectorizer::BoUpSLP::TreeEntry*) (llvm-dev/lib/libLLVMVectorize.so.21.0git+0x2e0dd4)
#10 0x0000ffff99e71b80 llvm::slpvectorizer::BoUpSLP::vectorizeTree(llvm::SmallDenseSet<llvm::Value*, 4u, llvm::DenseMapInfo<llvm::Value*, void>> const&, llvm::Instruction*, llvm::ArrayRef<std::tuple<llvm::Value*, unsigned int, bool>>) (llvm-dev/lib/libLLVMVectorize.so.21.0git+0x301b80)
#11 0x0000ffff99e73db8 llvm::slpvectorizer::BoUpSLP::vectorizeTree() (llvm-dev/lib/libLLVMVectorize.so.21.0git+0x303db8)
#12 0x0000ffff99e838d0 llvm::SLPVectorizerPass::vectorizeStoreChain(llvm::ArrayRef<llvm::Value*>, llvm::slpvectorizer::BoUpSLP&, unsigned int, unsigned int, unsigned int&) (llvm-dev/lib/libLLVMVectorize.so.21.0git+0x3138d0)
#13 0x0000ffff99e860b8 llvm::SLPVectorizerPass::vectorizeStores(llvm::ArrayRef<llvm::StoreInst*>, llvm::slpvectorizer::BoUpSLP&, llvm::DenseSet<std::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, llvm::DenseMapInfo<std::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, void>>&)::'lambda'(std::map<int, unsigned int, std::less<int>, std::allocator<std::pair<int const, unsigned int>>> const&)::operator()(std::map<int, unsigned int, std::less<int>, std::allocator<std::pair<int const, unsigned int>>> const&) const SLPVectorizer.cpp:0:0
#14 0x0000ffff99e86610 llvm::SLPVectorizerPass::vectorizeStores(llvm::ArrayRef<llvm::StoreInst*>, llvm::slpvectorizer::BoUpSLP&, llvm::DenseSet<std::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, llvm::DenseMapInfo<std::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, void>>&) (llvm-dev/lib/libLLVMVectorize.so.21.0git+0x316610)
#15 0x0000ffff99e87804 llvm::SLPVectorizerPass::vectorizeStoreChains(llvm::slpvectorizer::BoUpSLP&) (llvm-dev/lib/libLLVMVectorize.so.21.0git+0x317804)
#16 0x0000ffff99e88984 llvm::SLPVectorizerPass::runImpl(llvm::Function&, llvm::ScalarEvolution*, llvm::TargetTransformInfo*, llvm::TargetLibraryInfo*, llvm::AAResults*, llvm::LoopInfo*, llvm::DominatorTree*, llvm::AssumptionCache*, llvm::DemandedBits*, llvm::OptimizationRemarkEmitter*) (llvm-dev/lib/libLLVMVectorize.so.21.0git+0x318984)
#17 0x0000ffff99e894e0 llvm::SLPVectorizerPass::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (llvm-dev/lib/libLLVMVectorize.so.21.0git+0x3194e0)
#18 0x0000ffff9b835720 llvm::detail::PassModel<llvm::Function, llvm::SLPVectorizerPass, llvm::AnalysisManager<llvm::Function>>::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) PassBuilder.cpp:0:0
#19 0x0000ffff98f13850 llvm::PassManager<llvm::Function, llvm::AnalysisManager<llvm::Function>>::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (llvm-dev/lib/libLLVMCore.so.21.0git+0x4f3850)
#20 0x0000ffff9cb6e7f0 llvm::detail::PassModel<llvm::Function, llvm::PassManager<llvm::Function, llvm::AnalysisManager<llvm::Function>>, llvm::AnalysisManager<llvm::Function>>::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) X86CodeGenPassBuilder.cpp:0:0
#21 0x0000ffff98f12340 llvm::ModuleToFunctionPassAdaptor::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (llvm-dev/lib/libLLVMCore.so.21.0git+0x4f2340)
#22 0x0000ffff9cb6f590 llvm::detail::PassModel<llvm::Module, llvm::ModuleToFunctionPassAdaptor, llvm::AnalysisManager<llvm::Module>>::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) X86CodeGenPassBuilder.cpp:0:0
#23 0x0000ffff98f128ec llvm::PassManager<llvm::Module, llvm::AnalysisManager<llvm::Module>>::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (llvm-dev/lib/libLLVMCore.so.21.0git+0x4f28ec)
#24 0x0000ffff9d6a563c llvm::runPassPipeline(llvm::StringRef, llvm::Module&, llvm::TargetMachine*, llvm::TargetLibraryInfoImpl*, llvm::ToolOutputFile*, llvm::ToolOutputFile*, llvm::ToolOutputFile*, llvm::StringRef, llvm::ArrayRef<llvm::PassPlugin>, llvm::ArrayRef<std::function<void (llvm::PassBuilder&)>>, llvm::opt_tool::OutputKind, llvm::opt_tool::VerifierKind, bool, bool, bool, bool, bool, bool, bool) (llvm-dev/lib/libLLVMOptDriver.so.21.0git+0x3563c)
#25 0x0000ffff9d6b17e4 optMain (llvm-dev/lib/libLLVMOptDriver.so.21.0git+0x417e4)
#26 0x0000ffff9d2084c4 (/lib/aarch64-linux-gnu/libc.so.6+0x284c4)
#27 0x0000ffff9d208598 __libc_start_main (/lib/aarch64-linux-gnu/libc.so.6+0x28598)
#28 0x0000aaaab63808b0 _start (llvm-dev/bin/opt+0x108b0) This IR was produced by flang -S -O2 test.f90 -mllvm -print-before=slp-vectorizer -mllvm -print-module-scope and the program main
character(len=2),dimension(2) :: x1,x2
x1 = '12'
x2 = '34'
print *, [ character(len=1) :: [ character(len=2) :: (x1, x2, kk = 1, 2) ] ]
end program I confirmed this is a regrssion of this PR (0bec0f5) and the latest If you prefer a new GitHub issue, I'll create it. If you need more information, let me know. Thanks. |
Fixed in 1fcf78d.
No, it only affects the crash when building with the same profile. I can still reproduce the issue after #136146 if I gather a new instrumented profile and build Clang with it. And indeed, this looks like a miscompile by another Clang optimization. Neither …
To close the loop: the details of the problem are in #138208 (unrelated to SLPVectorizer).
Added initial support for (masked)loads + compress and (masked)interleaved loads.