-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[SLP]Represent SLP graph as a tree #126771
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SLP]Represent SLP graph as a tree #126771
Conversation
Created using spr 1.3.5
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-backend-systemz Author: Alexey Bataev (alexey-bataev) ChangesWe can stop using a graph representation of the SLP structure and switch AVX512, -O3+LTO ASCI_Purple/SMG2000 - extra code vectorized, small variations RISCV, -O3+LTO gcc-c-torture/execute/GCC-C-execute-pr28982b - better vector code Patch is 89.00 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/126771.diff 17 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e1c08077126db..6f9c62c6218b2 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1409,12 +1409,6 @@ class BoUpSLP {
/// Construct a vectorizable tree that starts at \p Roots.
void buildTree(ArrayRef<Value *> Roots);
- /// Returns whether the root node has in-tree uses.
- bool doesRootHaveInTreeUses() const {
- return !VectorizableTree.empty() &&
- !VectorizableTree.front()->UserTreeIndices.empty();
- }
-
/// Return the scalars of the root node.
ArrayRef<Value *> getRootNodeScalars() const {
assert(!VectorizableTree.empty() && "No graph to get the first node from");
@@ -1524,7 +1518,12 @@ class BoUpSLP {
/// shuffled vector entry + (possibly) permutation with other gathers. It
/// implements the checks only for possibly ordered scalars (Loads,
/// ExtractElement, ExtractValue), which can be part of the graph.
- std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
+ /// \param TopToBottom If true, used for the whole tree rotation, false - for
+ /// sub-tree rotations. \param IgnoreReorder true, if the order of the root
+ /// node might be ignored.
+ std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE,
+ bool TopToBottom,
+ bool IgnoreReorder);
/// Sort loads into increasing pointers offsets to allow greater clustering.
std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
@@ -1536,8 +1535,14 @@ class BoUpSLP {
/// identity order is important, or the actual order.
/// \param TopToBottom If true, include the order of vectorized stores and
/// insertelement nodes, otherwise skip them.
- std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
- bool TopToBottom);
+ /// \param IgnoreReorder true, if the root node order can be ignored.
+ std::optional<OrdersType>
+ getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder);
+
+ /// Checks if it is profitable to reorder the current tree.
+ /// If the tree does not contain many profitable reordable nodes, better to
+ /// skip it to save compile time.
+ bool isProfitableToReorder() const;
/// Reorders the current graph to the most profitable order starting from the
/// root node to the leaf nodes. The best order is chosen only from the nodes
@@ -1680,6 +1685,8 @@ class BoUpSLP {
bool operator == (const EdgeInfo &Other) const {
return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
}
+
+ operator bool() const { return UserTE != nullptr; }
};
/// A helper class used for scoring candidates for two consecutive lanes.
@@ -2999,8 +3006,10 @@ class BoUpSLP {
ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
TreeEntry *TE = nullptr;
const auto *It = find_if(VL, [&](Value *V) {
+ if (!isa<Instruction>(V))
+ return false;
for (TreeEntry *E : getTreeEntries(V)) {
- if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
+ if (E->UserTreeIndex == EdgeInfo(UserTE, OpIdx)) {
TE = E;
return true;
}
@@ -3031,7 +3040,7 @@ class BoUpSLP {
/// of a vector of (the same) instruction.
TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
- /// \ returns the graph entry for the \p Idx operand of the \p E entry.
+ /// \returns the graph entry for the \p Idx operand of the \p E entry.
const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
/// Gets the root instruction for the given node. If the node is a strided
@@ -3070,10 +3079,15 @@ class BoUpSLP {
/// Returns vectorized operand node, that matches the order of the scalars
/// operand number \p NodeIdx in entry \p E.
- TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx);
- const TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E,
- unsigned NodeIdx) const {
- return const_cast<BoUpSLP *>(this)->getMatchedVectorizedOperand(E, NodeIdx);
+ TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx,
+ ArrayRef<Value *> VL,
+ const InstructionsState &S);
+ const TreeEntry *
+ getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx,
+ ArrayRef<Value *> VL,
+ const InstructionsState &S) const {
+ return const_cast<BoUpSLP *>(this)->getMatchedVectorizedOperand(E, NodeIdx,
+ VL, S);
}
/// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
@@ -3249,9 +3263,8 @@ class BoUpSLP {
}
bool isOperandGatherNode(const EdgeInfo &UserEI) const {
- return isGather() && !UserTreeIndices.empty() &&
- UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
- UserTreeIndices.front().UserTE == UserEI.UserTE;
+ return isGather() && UserTreeIndex.EdgeIdx == UserEI.EdgeIdx &&
+ UserTreeIndex.UserTE == UserEI.UserTE;
}
/// \returns true if current entry has same operands as \p TE.
@@ -3335,7 +3348,7 @@ class BoUpSLP {
/// The TreeEntry index containing the user of this entry. We can actually
/// have multiple users so the data structure is not truly a tree.
- SmallVector<EdgeInfo, 1> UserTreeIndices;
+ EdgeInfo UserTreeIndex;
/// The index of this treeEntry in VectorizableTree.
unsigned Idx = 0;
@@ -3559,9 +3572,9 @@ class BoUpSLP {
for (unsigned ReorderIdx : ReorderIndices)
dbgs() << ReorderIdx << ", ";
dbgs() << "\n";
- dbgs() << "UserTreeIndices: ";
- for (const auto &EInfo : UserTreeIndices)
- dbgs() << EInfo << ", ";
+ dbgs() << "UserTreeIndex: ";
+ if (UserTreeIndex)
+ dbgs() << UserTreeIndex;
dbgs() << "\n";
if (!CombinedEntriesWithIndices.empty()) {
dbgs() << "Combined entries: ";
@@ -3707,7 +3720,7 @@ class BoUpSLP {
}
if (UserTreeIdx.UserTE)
- Last->UserTreeIndices.push_back(UserTreeIdx);
+ Last->UserTreeIndex = UserTreeIdx;
return Last;
}
@@ -4463,11 +4476,11 @@ template <> struct GraphTraits<BoUpSLP *> {
}
static ChildIteratorType child_begin(NodeRef N) {
- return {N->UserTreeIndices.begin(), N->Container};
+ return {&N->UserTreeIndex, N->Container};
}
static ChildIteratorType child_end(NodeRef N) {
- return {N->UserTreeIndices.end(), N->Container};
+ return {&N->UserTreeIndex + 1, N->Container};
}
/// For the node iterator we just need to turn the TreeEntry iterator into a
@@ -4632,7 +4645,8 @@ static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
}
std::optional<BoUpSLP::OrdersType>
-BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
+BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
+ bool TopToBottom, bool IgnoreReorder) {
assert(TE.isGather() && "Expected gather node only.");
// Try to find subvector extract/insert patterns and reorder only such
// patterns.
@@ -4658,6 +4672,26 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
if (GatherShuffles.size() == 1 &&
*GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
Entries.front().front()->isSame(TE.Scalars)) {
+ // If the full matched node in whole tree rotation - no need to consider the
+ // matching order, rotating the whole tree.
+ if (TopToBottom)
+ return std::nullopt;
+ // No need to keep the order for the same user node.
+ if (Entries.front().front()->UserTreeIndex.UserTE ==
+ TE.UserTreeIndex.UserTE)
+ return std::nullopt;
+ // No need to keep the order for the matched root node, if it can be freely
+ // reordered.
+ if (!IgnoreReorder && Entries.front().front()->Idx == 0)
+ return std::nullopt;
+ // If shuffling 2 elements only and the matching node has reverse reuses -
+ // no need to count order, both work fine.
+ if (!Entries.front().front()->ReuseShuffleIndices.empty() &&
+ TE.getVectorFactor() == 2 && Mask.size() == 2 &&
+ any_of(enumerate(Entries.front().front()->ReuseShuffleIndices),
+ [](const auto &P) { return P.value() % 2 != P.index() % 2; }))
+ return std::nullopt;
+
// Perfect match in the graph, will reuse the previously vectorized
// node. Cost is 0.
std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
@@ -5557,7 +5591,8 @@ static bool areTwoInsertFromSameBuildVector(
}
std::optional<BoUpSLP::OrdersType>
-BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
+BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
+ bool IgnoreReorder) {
// No need to reorder if need to shuffle reuses, still need to shuffle the
// node.
if (!TE.ReuseShuffleIndices.empty()) {
@@ -5577,7 +5612,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
unsigned Sz = TE.Scalars.size();
if (TE.isGather()) {
if (std::optional<OrdersType> CurrentOrder =
- findReusedOrderedScalars(TE)) {
+ findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) {
SmallVector<int> Mask;
fixupOrderingIndices(*CurrentOrder);
inversePermutation(*CurrentOrder, Mask);
@@ -5680,10 +5715,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
return std::move(ResOrder);
}
if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
- any_of(TE.UserTreeIndices,
- [](const EdgeInfo &EI) {
- return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
- }) &&
+ (!TE.UserTreeIndex ||
+ !Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) &&
(TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
return std::nullopt;
if ((TE.State == TreeEntry::Vectorize ||
@@ -5873,7 +5906,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
// FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
// has been auditted for correctness with non-power-of-two vectors.
if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
- if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
+ if (std::optional<OrdersType> CurrentOrder =
+ findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
return CurrentOrder;
}
return std::nullopt;
@@ -5943,6 +5977,67 @@ static void combineOrders(MutableArrayRef<unsigned> Order,
}
}
+bool BoUpSLP::isProfitableToReorder() const {
+ constexpr unsigned TinyVF = 2;
+ constexpr unsigned TinyTree = 10;
+ if (VectorizableTree.size() <= TinyTree)
+ return true;
+ if (VectorizableTree.front()->hasState() &&
+ !VectorizableTree.front()->isGather() &&
+ (VectorizableTree.front()->getOpcode() == Instruction::Store ||
+ VectorizableTree.front()->getOpcode() == Instruction::PHI ||
+ (VectorizableTree.front()->getVectorFactor() == TinyVF &&
+ (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
+ VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
+ VectorizableTree.front()->ReorderIndices.empty()) {
+ constexpr unsigned PhiOpsLimit = 12;
+ // Check if the tree has only single store and single (unordered) load node,
+ // other nodes are phis or geps/binops, combined with phis, and/orsingle
+ // gather load node
+ bool HasPhis = false;
+ if (VectorizableTree.front()->getOpcode() == Instruction::PHI &&
+ VectorizableTree.front()->Scalars.size() == TinyVF &&
+ VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
+ return false;
+ bool HasLoad = true;
+ unsigned GatherLoads = 0;
+ for (const std::unique_ptr<TreeEntry> &TE :
+ ArrayRef(VectorizableTree).drop_front()) {
+ if (!TE->hasState()) {
+ if (all_of(TE->Scalars, IsaPred<Constant, PHINode>) ||
+ all_of(TE->Scalars, IsaPred<BinaryOperator, PHINode>))
+ continue;
+ if (VectorizableTree.front()->Scalars.size() == TinyVF &&
+ any_of(TE->Scalars, IsaPred<PHINode, GEPOperator>))
+ continue;
+ return true;
+ }
+ if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
+ if (!TE->isGather()) {
+ HasLoad = false;
+ continue;
+ }
+ if (HasLoad)
+ return true;
+ ++GatherLoads;
+ if (GatherLoads >= 2)
+ return true;
+ }
+ if (TE->getOpcode() == Instruction::GetElementPtr ||
+ Instruction::isBinaryOp(TE->getOpcode()))
+ continue;
+ if (TE->getOpcode() != Instruction::PHI)
+ return true;
+ if (VectorizableTree.front()->Scalars.size() == TinyVF &&
+ TE->getNumOperands() > PhiOpsLimit)
+ return false;
+ HasPhis = true;
+ }
+ return !HasPhis;
+ }
+ return true;
+}
+
void BoUpSLP::reorderTopToBottom() {
// Maps VF to the graph nodes.
DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
@@ -5991,8 +6086,12 @@ void BoUpSLP::reorderTopToBottom() {
// TODO: Check the reverse order too.
}
+ bool IgnoreReorder =
+ !UserIgnoreList && VectorizableTree.front()->hasState() &&
+ (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
+ VectorizableTree.front()->getOpcode() == Instruction::Store);
if (std::optional<OrdersType> CurrentOrder =
- getReorderingData(*TE, /*TopToBottom=*/true)) {
+ getReorderingData(*TE, /*TopToBottom=*/true, IgnoreReorder)) {
// Do not include ordering for nodes used in the alt opcode vectorization,
// better to reorder them during bottom-to-top stage. If follow the order
// here, it causes reordering of the whole graph though actually it is
@@ -6003,14 +6102,13 @@ void BoUpSLP::reorderTopToBottom() {
unsigned Cnt = 0;
const TreeEntry *UserTE = TE.get();
while (UserTE && Cnt < RecursionMaxDepth) {
- if (UserTE->UserTreeIndices.size() != 1)
+ if (!UserTE->UserTreeIndex)
break;
- if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
- return EI.UserTE->State == TreeEntry::Vectorize &&
- EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
- }))
+ if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
+ UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
+ UserTE->UserTreeIndex.UserTE->Idx != 0)
return;
- UserTE = UserTE->UserTreeIndices.back().UserTE;
+ UserTE = UserTE->UserTreeIndex.UserTE;
++Cnt;
}
VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
@@ -6156,12 +6254,10 @@ void BoUpSLP::reorderTopToBottom() {
// Need to reorder the reuses masks of the operands with smaller VF to
// be able to find the match between the graph nodes and scalar
// operands of the given node during vectorization/cost estimation.
- assert(all_of(TE->UserTreeIndices,
- [VF, &TE](const EdgeInfo &EI) {
- return EI.UserTE->Scalars.size() == VF ||
- EI.UserTE->Scalars.size() ==
- TE->Scalars.size();
- }) &&
+ assert((!TE->UserTreeIndex ||
+ TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
+ TE->UserTreeIndex.UserTE->Scalars.size() ==
+ TE->Scalars.size()) &&
"All users must be of VF size.");
if (SLPReVec) {
assert(SLPReVec && "Only supported by REVEC.");
@@ -6169,15 +6265,11 @@ void BoUpSLP::reorderTopToBottom() {
// because ShuffleVectorInst supports only a limited set of
// patterns). Only do reorderNodeWithReuses if all of the users are
// not ShuffleVectorInst.
- if (all_of(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
- return isa<ShuffleVectorInst>(EI.UserTE->getMainOp());
- }))
+ if (isa<ShuffleVectorInst>(TE->UserTreeIndex.UserTE->getMainOp()))
continue;
- assert(none_of(TE->UserTreeIndices,
- [&](const EdgeInfo &EI) {
- return isa<ShuffleVectorInst>(
- EI.UserTE->getMainOp());
- }) &&
+ assert((!TE->UserTreeIndex ||
+ !isa<ShuffleVectorInst>(
+ TE->UserTreeIndex.UserTE->getMainOp())) &&
"Does not know how to reorder.");
}
// Update ordering of the operands with the smaller VF than the given
@@ -6232,10 +6324,6 @@ bool BoUpSLP::canReorderOperands(
}))
continue;
if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
- // Do not reorder if operand node is used by many user nodes.
- if (any_of(TE->UserTreeIndices,
- [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
- return false;
// Add the node to the list of the ordered nodes with the identity
// order.
Edges.emplace_back(I, TE);
@@ -6256,10 +6344,8 @@ bool BoUpSLP::canReorderOperands(
assert(TE->State != TreeEntry::Vectorize &&
TE->State != TreeEntry::StridedVectorize &&
"Only non-vectorized nodes are expected.");
- if (any_of(TE->UserTreeIndices,
- [UserTE, I](const EdgeInfo &EI) {
- return EI.UserTE == UserTE && EI.EdgeIdx == I;
- })) {
+ if (TE->UserTreeIndex.UserTE == UserTE &&
+ TE->UserTreeIndex.EdgeIdx == I) {
assert(TE->isSame(UserTE->getOperand(I)) &&
"Operand entry does not match operands.");
Gather = TE;
@@ -6286,8 +6372,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
if (TE->State != TreeEntry::Vectorize &&
TE->State != TreeEntry::StridedVectorize)
NonVectorized.push_back(TE.get());
- if (std::optional<OrdersType> CurrentOrder =
- getReorderingData(*TE, /*TopToBottom=*/false)) {
+ if (std::optional<OrdersType> CurrentOrder = getReorderingData(
+ *TE, /*TopToBottom=*/false, IgnoreReorder)) {
OrderedEntries.insert(TE.get());
if (!(TE->State == TreeEntry::Vectorize ||
TE->State == TreeEntry::StridedVectorize) ||
@@ -6300,29 +6386,24 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
// I.e., if the node has operands, that are reordered, try to make at least
// one operand order in the natural order and reorder others + reorder the
// user node itself.
- SmallPtrSet<const TreeEntry *, 4> Visited;
+ SmallPtrSet<const TreeEntry *, 4> Visited, RevisitedOps;
while (!OrderedEntries.empty()) {
// 1. Filter out only reordered nodes.
- // 2. If the entry has multiple uses - skip it and jump to the next node.
DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
SmallVector<TreeEntry *> Filtered;
for (TreeEntry *TE : OrderedEntries) {
if (!(TE->State == TreeEntry::Vectorize ||
TE->State == TreeEntry::StridedVectorize ||
(TE->isGather() && GathersToOrders.contains(TE))) ||
- TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
- !all_of(drop_begin(TE->UserTreeIndices),
- [TE](const EdgeInfo &EI) {
- return EI.UserTE == TE->UserTreeIndices.front().UserTE;
- }) ||
+ !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
...
[truncated]
|
@llvm/pr-subscribers-llvm-transforms Author: Alexey Bataev (alexey-bataev) ChangesWe can stop using a graph representation of the SLP structure and switch AVX512, -O3+LTO ASCI_Purple/SMG2000 - extra code vectorized, small variations RISCV, -O3+LTO gcc-c-torture/execute/GCC-C-execute-pr28982b - better vector code Patch is 89.00 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/126771.diff 17 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e1c08077126db..6f9c62c6218b2 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1409,12 +1409,6 @@ class BoUpSLP {
/// Construct a vectorizable tree that starts at \p Roots.
void buildTree(ArrayRef<Value *> Roots);
- /// Returns whether the root node has in-tree uses.
- bool doesRootHaveInTreeUses() const {
- return !VectorizableTree.empty() &&
- !VectorizableTree.front()->UserTreeIndices.empty();
- }
-
/// Return the scalars of the root node.
ArrayRef<Value *> getRootNodeScalars() const {
assert(!VectorizableTree.empty() && "No graph to get the first node from");
@@ -1524,7 +1518,12 @@ class BoUpSLP {
/// shuffled vector entry + (possibly) permutation with other gathers. It
/// implements the checks only for possibly ordered scalars (Loads,
/// ExtractElement, ExtractValue), which can be part of the graph.
- std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
+ /// \param TopToBottom If true, used for the whole tree rotation, false - for
+ /// sub-tree rotations. \param IgnoreReorder true, if the order of the root
+ /// node might be ignored.
+ std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE,
+ bool TopToBottom,
+ bool IgnoreReorder);
/// Sort loads into increasing pointers offsets to allow greater clustering.
std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
@@ -1536,8 +1535,14 @@ class BoUpSLP {
/// identity order is important, or the actual order.
/// \param TopToBottom If true, include the order of vectorized stores and
/// insertelement nodes, otherwise skip them.
- std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
- bool TopToBottom);
+ /// \param IgnoreReorder true, if the root node order can be ignored.
+ std::optional<OrdersType>
+ getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder);
+
+ /// Checks if it is profitable to reorder the current tree.
+ /// If the tree does not contain many profitable reordable nodes, better to
+ /// skip it to save compile time.
+ bool isProfitableToReorder() const;
/// Reorders the current graph to the most profitable order starting from the
/// root node to the leaf nodes. The best order is chosen only from the nodes
@@ -1680,6 +1685,8 @@ class BoUpSLP {
bool operator == (const EdgeInfo &Other) const {
return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
}
+
+ operator bool() const { return UserTE != nullptr; }
};
/// A helper class used for scoring candidates for two consecutive lanes.
@@ -2999,8 +3006,10 @@ class BoUpSLP {
ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
TreeEntry *TE = nullptr;
const auto *It = find_if(VL, [&](Value *V) {
+ if (!isa<Instruction>(V))
+ return false;
for (TreeEntry *E : getTreeEntries(V)) {
- if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
+ if (E->UserTreeIndex == EdgeInfo(UserTE, OpIdx)) {
TE = E;
return true;
}
@@ -3031,7 +3040,7 @@ class BoUpSLP {
/// of a vector of (the same) instruction.
TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
- /// \ returns the graph entry for the \p Idx operand of the \p E entry.
+ /// \returns the graph entry for the \p Idx operand of the \p E entry.
const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
/// Gets the root instruction for the given node. If the node is a strided
@@ -3070,10 +3079,15 @@ class BoUpSLP {
/// Returns vectorized operand node, that matches the order of the scalars
/// operand number \p NodeIdx in entry \p E.
- TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx);
- const TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E,
- unsigned NodeIdx) const {
- return const_cast<BoUpSLP *>(this)->getMatchedVectorizedOperand(E, NodeIdx);
+ TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx,
+ ArrayRef<Value *> VL,
+ const InstructionsState &S);
+ const TreeEntry *
+ getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx,
+ ArrayRef<Value *> VL,
+ const InstructionsState &S) const {
+ return const_cast<BoUpSLP *>(this)->getMatchedVectorizedOperand(E, NodeIdx,
+ VL, S);
}
/// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
@@ -3249,9 +3263,8 @@ class BoUpSLP {
}
bool isOperandGatherNode(const EdgeInfo &UserEI) const {
- return isGather() && !UserTreeIndices.empty() &&
- UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
- UserTreeIndices.front().UserTE == UserEI.UserTE;
+ return isGather() && UserTreeIndex.EdgeIdx == UserEI.EdgeIdx &&
+ UserTreeIndex.UserTE == UserEI.UserTE;
}
/// \returns true if current entry has same operands as \p TE.
@@ -3335,7 +3348,7 @@ class BoUpSLP {
/// The TreeEntry index containing the user of this entry. We can actually
/// have multiple users so the data structure is not truly a tree.
- SmallVector<EdgeInfo, 1> UserTreeIndices;
+ EdgeInfo UserTreeIndex;
/// The index of this treeEntry in VectorizableTree.
unsigned Idx = 0;
@@ -3559,9 +3572,9 @@ class BoUpSLP {
for (unsigned ReorderIdx : ReorderIndices)
dbgs() << ReorderIdx << ", ";
dbgs() << "\n";
- dbgs() << "UserTreeIndices: ";
- for (const auto &EInfo : UserTreeIndices)
- dbgs() << EInfo << ", ";
+ dbgs() << "UserTreeIndex: ";
+ if (UserTreeIndex)
+ dbgs() << UserTreeIndex;
dbgs() << "\n";
if (!CombinedEntriesWithIndices.empty()) {
dbgs() << "Combined entries: ";
@@ -3707,7 +3720,7 @@ class BoUpSLP {
}
if (UserTreeIdx.UserTE)
- Last->UserTreeIndices.push_back(UserTreeIdx);
+ Last->UserTreeIndex = UserTreeIdx;
return Last;
}
@@ -4463,11 +4476,11 @@ template <> struct GraphTraits<BoUpSLP *> {
}
static ChildIteratorType child_begin(NodeRef N) {
- return {N->UserTreeIndices.begin(), N->Container};
+ return {&N->UserTreeIndex, N->Container};
}
static ChildIteratorType child_end(NodeRef N) {
- return {N->UserTreeIndices.end(), N->Container};
+ return {&N->UserTreeIndex + 1, N->Container};
}
/// For the node iterator we just need to turn the TreeEntry iterator into a
@@ -4632,7 +4645,8 @@ static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
}
std::optional<BoUpSLP::OrdersType>
-BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
+BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
+ bool TopToBottom, bool IgnoreReorder) {
assert(TE.isGather() && "Expected gather node only.");
// Try to find subvector extract/insert patterns and reorder only such
// patterns.
@@ -4658,6 +4672,26 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
if (GatherShuffles.size() == 1 &&
*GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
Entries.front().front()->isSame(TE.Scalars)) {
+ // If the full matched node in whole tree rotation - no need to consider the
+ // matching order, rotating the whole tree.
+ if (TopToBottom)
+ return std::nullopt;
+ // No need to keep the order for the same user node.
+ if (Entries.front().front()->UserTreeIndex.UserTE ==
+ TE.UserTreeIndex.UserTE)
+ return std::nullopt;
+ // No need to keep the order for the matched root node, if it can be freely
+ // reordered.
+ if (!IgnoreReorder && Entries.front().front()->Idx == 0)
+ return std::nullopt;
+ // If shuffling 2 elements only and the matching node has reverse reuses -
+ // no need to count order, both work fine.
+ if (!Entries.front().front()->ReuseShuffleIndices.empty() &&
+ TE.getVectorFactor() == 2 && Mask.size() == 2 &&
+ any_of(enumerate(Entries.front().front()->ReuseShuffleIndices),
+ [](const auto &P) { return P.value() % 2 != P.index() % 2; }))
+ return std::nullopt;
+
// Perfect match in the graph, will reuse the previously vectorized
// node. Cost is 0.
std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
@@ -5557,7 +5591,8 @@ static bool areTwoInsertFromSameBuildVector(
}
std::optional<BoUpSLP::OrdersType>
-BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
+BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
+ bool IgnoreReorder) {
// No need to reorder if need to shuffle reuses, still need to shuffle the
// node.
if (!TE.ReuseShuffleIndices.empty()) {
@@ -5577,7 +5612,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
unsigned Sz = TE.Scalars.size();
if (TE.isGather()) {
if (std::optional<OrdersType> CurrentOrder =
- findReusedOrderedScalars(TE)) {
+ findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) {
SmallVector<int> Mask;
fixupOrderingIndices(*CurrentOrder);
inversePermutation(*CurrentOrder, Mask);
@@ -5680,10 +5715,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
return std::move(ResOrder);
}
if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
- any_of(TE.UserTreeIndices,
- [](const EdgeInfo &EI) {
- return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
- }) &&
+ (!TE.UserTreeIndex ||
+ !Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) &&
(TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
return std::nullopt;
if ((TE.State == TreeEntry::Vectorize ||
@@ -5873,7 +5906,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
// FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
// has been auditted for correctness with non-power-of-two vectors.
if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
- if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
+ if (std::optional<OrdersType> CurrentOrder =
+ findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
return CurrentOrder;
}
return std::nullopt;
@@ -5943,6 +5977,67 @@ static void combineOrders(MutableArrayRef<unsigned> Order,
}
}
+bool BoUpSLP::isProfitableToReorder() const {
+ constexpr unsigned TinyVF = 2;
+ constexpr unsigned TinyTree = 10;
+ if (VectorizableTree.size() <= TinyTree)
+ return true;
+ if (VectorizableTree.front()->hasState() &&
+ !VectorizableTree.front()->isGather() &&
+ (VectorizableTree.front()->getOpcode() == Instruction::Store ||
+ VectorizableTree.front()->getOpcode() == Instruction::PHI ||
+ (VectorizableTree.front()->getVectorFactor() == TinyVF &&
+ (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
+ VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
+ VectorizableTree.front()->ReorderIndices.empty()) {
+ constexpr unsigned PhiOpsLimit = 12;
+ // Check if the tree has only single store and single (unordered) load node,
+ // other nodes are phis or geps/binops, combined with phis, and/orsingle
+ // gather load node
+ bool HasPhis = false;
+ if (VectorizableTree.front()->getOpcode() == Instruction::PHI &&
+ VectorizableTree.front()->Scalars.size() == TinyVF &&
+ VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
+ return false;
+ bool HasLoad = true;
+ unsigned GatherLoads = 0;
+ for (const std::unique_ptr<TreeEntry> &TE :
+ ArrayRef(VectorizableTree).drop_front()) {
+ if (!TE->hasState()) {
+ if (all_of(TE->Scalars, IsaPred<Constant, PHINode>) ||
+ all_of(TE->Scalars, IsaPred<BinaryOperator, PHINode>))
+ continue;
+ if (VectorizableTree.front()->Scalars.size() == TinyVF &&
+ any_of(TE->Scalars, IsaPred<PHINode, GEPOperator>))
+ continue;
+ return true;
+ }
+ if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
+ if (!TE->isGather()) {
+ HasLoad = false;
+ continue;
+ }
+ if (HasLoad)
+ return true;
+ ++GatherLoads;
+ if (GatherLoads >= 2)
+ return true;
+ }
+ if (TE->getOpcode() == Instruction::GetElementPtr ||
+ Instruction::isBinaryOp(TE->getOpcode()))
+ continue;
+ if (TE->getOpcode() != Instruction::PHI)
+ return true;
+ if (VectorizableTree.front()->Scalars.size() == TinyVF &&
+ TE->getNumOperands() > PhiOpsLimit)
+ return false;
+ HasPhis = true;
+ }
+ return !HasPhis;
+ }
+ return true;
+}
+
void BoUpSLP::reorderTopToBottom() {
// Maps VF to the graph nodes.
DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
@@ -5991,8 +6086,12 @@ void BoUpSLP::reorderTopToBottom() {
// TODO: Check the reverse order too.
}
+ bool IgnoreReorder =
+ !UserIgnoreList && VectorizableTree.front()->hasState() &&
+ (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
+ VectorizableTree.front()->getOpcode() == Instruction::Store);
if (std::optional<OrdersType> CurrentOrder =
- getReorderingData(*TE, /*TopToBottom=*/true)) {
+ getReorderingData(*TE, /*TopToBottom=*/true, IgnoreReorder)) {
// Do not include ordering for nodes used in the alt opcode vectorization,
// better to reorder them during bottom-to-top stage. If follow the order
// here, it causes reordering of the whole graph though actually it is
@@ -6003,14 +6102,13 @@ void BoUpSLP::reorderTopToBottom() {
unsigned Cnt = 0;
const TreeEntry *UserTE = TE.get();
while (UserTE && Cnt < RecursionMaxDepth) {
- if (UserTE->UserTreeIndices.size() != 1)
+ if (!UserTE->UserTreeIndex)
break;
- if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
- return EI.UserTE->State == TreeEntry::Vectorize &&
- EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
- }))
+ if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
+ UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
+ UserTE->UserTreeIndex.UserTE->Idx != 0)
return;
- UserTE = UserTE->UserTreeIndices.back().UserTE;
+ UserTE = UserTE->UserTreeIndex.UserTE;
++Cnt;
}
VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
@@ -6156,12 +6254,10 @@ void BoUpSLP::reorderTopToBottom() {
// Need to reorder the reuses masks of the operands with smaller VF to
// be able to find the match between the graph nodes and scalar
// operands of the given node during vectorization/cost estimation.
- assert(all_of(TE->UserTreeIndices,
- [VF, &TE](const EdgeInfo &EI) {
- return EI.UserTE->Scalars.size() == VF ||
- EI.UserTE->Scalars.size() ==
- TE->Scalars.size();
- }) &&
+ assert((!TE->UserTreeIndex ||
+ TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
+ TE->UserTreeIndex.UserTE->Scalars.size() ==
+ TE->Scalars.size()) &&
"All users must be of VF size.");
if (SLPReVec) {
assert(SLPReVec && "Only supported by REVEC.");
@@ -6169,15 +6265,11 @@ void BoUpSLP::reorderTopToBottom() {
// because ShuffleVectorInst supports only a limited set of
// patterns). Only do reorderNodeWithReuses if all of the users are
// not ShuffleVectorInst.
- if (all_of(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
- return isa<ShuffleVectorInst>(EI.UserTE->getMainOp());
- }))
+ if (isa<ShuffleVectorInst>(TE->UserTreeIndex.UserTE->getMainOp()))
continue;
- assert(none_of(TE->UserTreeIndices,
- [&](const EdgeInfo &EI) {
- return isa<ShuffleVectorInst>(
- EI.UserTE->getMainOp());
- }) &&
+ assert((!TE->UserTreeIndex ||
+ !isa<ShuffleVectorInst>(
+ TE->UserTreeIndex.UserTE->getMainOp())) &&
"Does not know how to reorder.");
}
// Update ordering of the operands with the smaller VF than the given
@@ -6232,10 +6324,6 @@ bool BoUpSLP::canReorderOperands(
}))
continue;
if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
- // Do not reorder if operand node is used by many user nodes.
- if (any_of(TE->UserTreeIndices,
- [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
- return false;
// Add the node to the list of the ordered nodes with the identity
// order.
Edges.emplace_back(I, TE);
@@ -6256,10 +6344,8 @@ bool BoUpSLP::canReorderOperands(
assert(TE->State != TreeEntry::Vectorize &&
TE->State != TreeEntry::StridedVectorize &&
"Only non-vectorized nodes are expected.");
- if (any_of(TE->UserTreeIndices,
- [UserTE, I](const EdgeInfo &EI) {
- return EI.UserTE == UserTE && EI.EdgeIdx == I;
- })) {
+ if (TE->UserTreeIndex.UserTE == UserTE &&
+ TE->UserTreeIndex.EdgeIdx == I) {
assert(TE->isSame(UserTE->getOperand(I)) &&
"Operand entry does not match operands.");
Gather = TE;
@@ -6286,8 +6372,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
if (TE->State != TreeEntry::Vectorize &&
TE->State != TreeEntry::StridedVectorize)
NonVectorized.push_back(TE.get());
- if (std::optional<OrdersType> CurrentOrder =
- getReorderingData(*TE, /*TopToBottom=*/false)) {
+ if (std::optional<OrdersType> CurrentOrder = getReorderingData(
+ *TE, /*TopToBottom=*/false, IgnoreReorder)) {
OrderedEntries.insert(TE.get());
if (!(TE->State == TreeEntry::Vectorize ||
TE->State == TreeEntry::StridedVectorize) ||
@@ -6300,29 +6386,24 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
// I.e., if the node has operands, that are reordered, try to make at least
// one operand order in the natural order and reorder others + reorder the
// user node itself.
- SmallPtrSet<const TreeEntry *, 4> Visited;
+ SmallPtrSet<const TreeEntry *, 4> Visited, RevisitedOps;
while (!OrderedEntries.empty()) {
// 1. Filter out only reordered nodes.
- // 2. If the entry has multiple uses - skip it and jump to the next node.
DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
SmallVector<TreeEntry *> Filtered;
for (TreeEntry *TE : OrderedEntries) {
if (!(TE->State == TreeEntry::Vectorize ||
TE->State == TreeEntry::StridedVectorize ||
(TE->isGather() && GathersToOrders.contains(TE))) ||
- TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
- !all_of(drop_begin(TE->UserTreeIndices),
- [TE](const EdgeInfo &EI) {
- return EI.UserTE == TE->UserTreeIndices.front().UserTE;
- }) ||
+ !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
...
[truncated]
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
Created using spr 1.3.5
Ping! |
cc: @lukel97 |
I think this might fix the regression in #124499? |
I hope so, especially being combined with #123360, which I postponed before landing this patch |
Created using spr 1.3.5
✅ With the latest revision this PR passed the undef deprecator. |
Created using spr 1.3.5
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
SGTM, but its a big refactor and I'm not sure if there's anyway to break it down at all?
VectorizableTree.front()->getOpcode() == Instruction::ICmp))) && | ||
VectorizableTree.front()->ReorderIndices.empty()) { | ||
constexpr unsigned PhiOpsLimit = 12; | ||
constexpr unsigned GatherLoadsLimit = 2; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hoist this to the top?
Cannot split, have to commit it as is, all changes depend on each other |
Created using spr 1.3.5
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
We can stop using a graph representation of the SLP structure and switch directly to tree by relying on a single user of each tree node. If the node has multiple uses, other uses must be represented as a separate gather/buildvector node, which then will be combined with the existing vectorized node(s) uoon cost estimation/codegen. This allow to simplify inner structure and turn in some extra optimizations, which could not be turned on for the nodes with multi users (reordering, minbitwidth analysis). AVX512, -O3+LTO Metric: size..text results results0 diff test-suite :: MultiSource/Benchmarks/ASCI_Purple/SMG2000/smg2000.test 253453.00 254253.00 0.3% test-suite :: External/SPEC/CFP2006/444.namd/444.namd.test 251411.00 252051.00 0.3% test-suite :: SingleSource/Benchmarks/Misc/oourafft.test 19114.00 19146.00 0.2% test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test 1399200.00 1399520.00 0.0% test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test 1399200.00 1399520.00 0.0% test-suite :: MicroBenchmarks/LCALS/SubsetALambdaLoops/lcalsALambda.test 304310.00 304326.00 0.0% test-suite :: MicroBenchmarks/LCALS/SubsetARawLoops/lcalsARaw.test 304662.00 304678.00 0.0% test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 12566919.00 12567511.00 0.0% test-suite :: External/SPEC/CFP2006/453.povray/453.povray.test 1146300.00 1146316.00 0.0% test-suite :: External/SPEC/CFP2017rate/511.povray_r/511.povray_r.test 1159864.00 1159880.00 0.0% test-suite :: External/SPEC/CINT2017rate/502.gcc_r/502.gcc_r.test 9407880.00 9407864.00 -0.0% test-suite :: External/SPEC/CINT2017speed/602.gcc_s/602.gcc_s.test 9407880.00 9407864.00 -0.0% test-suite :: MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4.test 1011612.00 1011596.00 -0.0% test-suite :: MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/timberwolfmc.test 280584.00 280536.00 -0.0% test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/miniFE.test 93016.00 93000.00 -0.0% ASCI_Purple/SMG2000 - extra code vectorized, small variations CFP2006/444.namd - small variations, less shuffles Benchmarks/Misc/oourafft - small variations CFP2017rate/538.imagick_r CFP2017speed/638.imagick_s - small variations, less shuffles LCALS/SubsetALambdaLoops - less shuffles LCALS/SubsetARawLoops - less shuffles CFP2017rate/526.blender_r - small variations, extra vector code CFP2006/453.povray - small variations CFP2017rate/511.povray_r - small variations CINT2017rate/502.gcc_r CINT2017speed/602.gcc_s - small variations Benchmarks/tramp3d-v4 - small variations Prolangs-C/TimberWolfMC - small variations DOE-ProxyApps-C++/miniFE - extra code vectorized, small variations DOE-ProxyApps-C++/CLAMR - extra code vectorized, small variations ASCI_Purple/SMG2000 - no significant changes RISCV, -O3+LTO Metric: size..text results results0 diff test-suite :: SingleSource/Regression/C/gcc-c-torture/execute/GCC-C-execute-pr28982b.test 1812.00 1866.00 3.0% test-suite :: MultiSource/Benchmarks/Olden/health/health.test 3946.00 4016.00 1.8% test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 513180.00 513550.00 0.1% test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 513180.00 513550.00 0.1% test-suite :: External/SPEC/CINT2017rate/502.gcc_r/502.gcc_r.test 7672198.00 7672202.00 0.0% test-suite :: External/SPEC/CINT2017speed/602.gcc_s/602.gcc_s.test 7672198.00 7672202.00 0.0% test-suite :: External/SPEC/CFP2017rate/508.namd_r/508.namd_r.test 746060.00 746044.00 -0.0% test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 9497716.00 9497364.00 -0.0% test-suite :: External/SPEC/CFP2006/453.povray/453.povray.test 948266.00 948214.00 -0.0% test-suite :: External/SPEC/CFP2006/433.milc/433.milc.test 89874.00 89862.00 -0.0% test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 835492.00 835346.00 -0.0% test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/miniFE.test 66230.00 66202.00 -0.0% test-suite :: External/SPEC/CFP2017rate/511.povray_r/511.povray_r.test 946090.00 944206.00 -0.2% test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test 1136404.00 1131854.00 -0.4% test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test 1136404.00 1131854.00 -0.4% gcc-c-torture/execute/GCC-C-execute-pr28982b - better vector code Olden/health - extra vector code CINT2017speed/625.x264_s CINT2017rate/525.x264_r - small variation + improvements in reordering, @pixel_hadamard_ac stopped being vectorized because of some non-effective shuffle recognition by the compiler CINT2017rate/502.gcc_r CINT2017speed/602.gcc_s - small variations CFP2017rate/508.namd_r - small variations CFP2017rate/526.blender_r - small variations CFP2006/453.povray - extra vector code Benchmarks/7zip - extra vector code DOE-ProxyApps-C++/miniFE - small variations CFP2017rate/511.povray_r - extra vector code CFP2017speed/638.imagick_s CFP2017rate/538.imagick_r - extra vector code Reviewers: RKSimon, hiraditya Reviewed By: RKSimon Pull Request: llvm/llvm-project#126771
@alexey-bataev It seems odd to insist on handling vectorizable DAGs as individual sub-trees with external users. This seems to take us further from being able to eventually cost multiple roots in one go (i.e. trees with share sub-expressions). Practical engineering is filed with tradeoffs, so if you're reasonable sure this is the right direction, please continue. I'm just noting the question. |
We still support this, but not as a DAG, but as a smart matching of the nodes, so we don't miss anything here. It could be a problem before, but not now, since we support effective matching for gathered nodes against vector nodes. |
@alexey-bataev heads-up we see at least a compiler crash in the |
Thanks, waiting for repro |
Not sure if it's the same issue but I found something: #130082 |
Compilation command:
Can you please consider reverting given the crash and @usatiuk's report (nondeterminism + miscompile)? |
It's going to be very hard to revert it, there are already several changes on top of it. I will fix all reported issues today. |
Fixed in 1182be5 |
We can stop using a graph representation of the SLP structure and switch directly to tree by relying on a single user of each tree node. If the node has multiple uses, other uses must be represented as a separate gather/buildvector node, which then will be combined with the existing vectorized node(s) uoon cost estimation/codegen. This allow to simplify inner structure and turn in some extra optimizations, which could not be turned on for the nodes with multi users (reordering, minbitwidth analysis). AVX512, -O3+LTO Metric: size..text results results0 diff test-suite :: MultiSource/Benchmarks/ASCI_Purple/SMG2000/smg2000.test 253453.00 254253.00 0.3% test-suite :: External/SPEC/CFP2006/444.namd/444.namd.test 251411.00 252051.00 0.3% test-suite :: SingleSource/Benchmarks/Misc/oourafft.test 19114.00 19146.00 0.2% test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test 1399200.00 1399520.00 0.0% test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test 1399200.00 1399520.00 0.0% test-suite :: MicroBenchmarks/LCALS/SubsetALambdaLoops/lcalsALambda.test 304310.00 304326.00 0.0% test-suite :: MicroBenchmarks/LCALS/SubsetARawLoops/lcalsARaw.test 304662.00 304678.00 0.0% test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 12566919.00 12567511.00 0.0% test-suite :: External/SPEC/CFP2006/453.povray/453.povray.test 1146300.00 1146316.00 0.0% test-suite :: External/SPEC/CFP2017rate/511.povray_r/511.povray_r.test 1159864.00 1159880.00 0.0% test-suite :: External/SPEC/CINT2017rate/502.gcc_r/502.gcc_r.test 9407880.00 9407864.00 -0.0% test-suite :: External/SPEC/CINT2017speed/602.gcc_s/602.gcc_s.test 9407880.00 9407864.00 -0.0% test-suite :: MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4.test 1011612.00 1011596.00 -0.0% test-suite :: MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/timberwolfmc.test 280584.00 280536.00 -0.0% test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/miniFE.test 93016.00 93000.00 -0.0% ASCI_Purple/SMG2000 - extra code vectorized, small variations CFP2006/444.namd - small variations, less shuffles Benchmarks/Misc/oourafft - small variations CFP2017rate/538.imagick_r CFP2017speed/638.imagick_s - small variations, less shuffles LCALS/SubsetALambdaLoops - less shuffles LCALS/SubsetARawLoops - less shuffles CFP2017rate/526.blender_r - small variations, extra vector code CFP2006/453.povray - small variations CFP2017rate/511.povray_r - small variations CINT2017rate/502.gcc_r CINT2017speed/602.gcc_s - small variations Benchmarks/tramp3d-v4 - small variations Prolangs-C/TimberWolfMC - small variations DOE-ProxyApps-C++/miniFE - extra code vectorized, small variations DOE-ProxyApps-C++/CLAMR - extra code vectorized, small variations ASCI_Purple/SMG2000 - no significant changes RISCV, -O3+LTO Metric: size..text results results0 diff test-suite :: SingleSource/Regression/C/gcc-c-torture/execute/GCC-C-execute-pr28982b.test 1812.00 1866.00 3.0% test-suite :: MultiSource/Benchmarks/Olden/health/health.test 3946.00 4016.00 1.8% test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 513180.00 513550.00 0.1% test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 513180.00 513550.00 0.1% test-suite :: External/SPEC/CINT2017rate/502.gcc_r/502.gcc_r.test 7672198.00 7672202.00 0.0% test-suite :: External/SPEC/CINT2017speed/602.gcc_s/602.gcc_s.test 7672198.00 7672202.00 0.0% test-suite :: External/SPEC/CFP2017rate/508.namd_r/508.namd_r.test 746060.00 746044.00 -0.0% test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 9497716.00 9497364.00 -0.0% test-suite :: External/SPEC/CFP2006/453.povray/453.povray.test 948266.00 948214.00 -0.0% test-suite :: External/SPEC/CFP2006/433.milc/433.milc.test 89874.00 89862.00 -0.0% test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 835492.00 835346.00 -0.0% test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/miniFE.test 66230.00 66202.00 -0.0% test-suite :: External/SPEC/CFP2017rate/511.povray_r/511.povray_r.test 946090.00 944206.00 -0.2% test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test 1136404.00 1131854.00 -0.4% test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test 1136404.00 1131854.00 -0.4% gcc-c-torture/execute/GCC-C-execute-pr28982b - better vector code Olden/health - extra vector code CINT2017speed/625.x264_s CINT2017rate/525.x264_r - small variation + improvements in reordering, @pixel_hadamard_ac stopped being vectorized because of some non-effective shuffle recognition by the compiler CINT2017rate/502.gcc_r CINT2017speed/602.gcc_s - small variations CFP2017rate/508.namd_r - small variations CFP2017rate/526.blender_r - small variations CFP2006/453.povray - extra vector code Benchmarks/7zip - extra vector code DOE-ProxyApps-C++/miniFE - small variations CFP2017rate/511.povray_r - extra vector code CFP2017speed/638.imagick_s CFP2017rate/538.imagick_r - extra vector code Reviewers: RKSimon, hiraditya Reviewed By: RKSimon Pull Request: llvm/llvm-project#126771
We can stop using a graph representation of the SLP structure and switch
directly to tree by relying on a single user of each tree node. If the
node has multiple uses, other uses must be represented as a separate
gather/buildvector node, which then will be combined with the existing
vectorized node(s) uoon cost estimation/codegen.
This allow to simplify inner structure and turn in some extra
optimizations, which could not be turned on for the nodes with multi
users (reordering, minbitwidth analysis).
AVX512, -O3+LTO
Metric: size..text
results results0 diff
test-suite :: MultiSource/Benchmarks/ASCI_Purple/SMG2000/smg2000.test 253453.00 254253.00 0.3%
test-suite :: External/SPEC/CFP2006/444.namd/444.namd.test 251411.00 252051.00 0.3%
test-suite :: SingleSource/Benchmarks/Misc/oourafft.test 19114.00 19146.00 0.2%
test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test 1399200.00 1399520.00 0.0%
test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test 1399200.00 1399520.00 0.0%
test-suite :: MicroBenchmarks/LCALS/SubsetALambdaLoops/lcalsALambda.test 304310.00 304326.00 0.0%
test-suite :: MicroBenchmarks/LCALS/SubsetARawLoops/lcalsARaw.test 304662.00 304678.00 0.0%
test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 12566919.00 12567511.00 0.0%
test-suite :: External/SPEC/CFP2006/453.povray/453.povray.test 1146300.00 1146316.00 0.0%
test-suite :: External/SPEC/CFP2017rate/511.povray_r/511.povray_r.test 1159864.00 1159880.00 0.0%
test-suite :: External/SPEC/CINT2017rate/502.gcc_r/502.gcc_r.test 9407880.00 9407864.00 -0.0%
test-suite :: External/SPEC/CINT2017speed/602.gcc_s/602.gcc_s.test 9407880.00 9407864.00 -0.0%
test-suite :: MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4.test 1011612.00 1011596.00 -0.0%
test-suite :: MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/timberwolfmc.test 280584.00 280536.00 -0.0%
test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/miniFE.test 93016.00 93000.00 -0.0%
ASCI_Purple/SMG2000 - extra code vectorized, small variations
CFP2006/444.namd - small variations, less shuffles
Benchmarks/Misc/oourafft - small variations
CFP2017rate/538.imagick_r
CFP2017speed/638.imagick_s - small variations, less shuffles
LCALS/SubsetALambdaLoops - less shuffles
LCALS/SubsetARawLoops - less shuffles
CFP2017rate/526.blender_r - small variations, extra vector code
CFP2006/453.povray - small variations
CFP2017rate/511.povray_r - small variations
CINT2017rate/502.gcc_r
CINT2017speed/602.gcc_s - small variations
Benchmarks/tramp3d-v4 - small variations
Prolangs-C/TimberWolfMC - small variations
DOE-ProxyApps-C++/miniFE - extra code vectorized, small variations
DOE-ProxyApps-C++/CLAMR - extra code vectorized, small variations
ASCI_Purple/SMG2000 - no significant changes
RISCV, -O3+LTO
Metric: size..text
results results0 diff
test-suite :: SingleSource/Regression/C/gcc-c-torture/execute/GCC-C-execute-pr28982b.test 1812.00 1866.00 3.0%
test-suite :: MultiSource/Benchmarks/Olden/health/health.test 3946.00 4016.00 1.8%
test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 513180.00 513550.00 0.1%
test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 513180.00 513550.00 0.1%
test-suite :: External/SPEC/CINT2017rate/502.gcc_r/502.gcc_r.test 7672198.00 7672202.00 0.0%
test-suite :: External/SPEC/CINT2017speed/602.gcc_s/602.gcc_s.test 7672198.00 7672202.00 0.0%
test-suite :: External/SPEC/CFP2017rate/508.namd_r/508.namd_r.test 746060.00 746044.00 -0.0%
test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 9497716.00 9497364.00 -0.0%
test-suite :: External/SPEC/CFP2006/453.povray/453.povray.test 948266.00 948214.00 -0.0%
test-suite :: External/SPEC/CFP2006/433.milc/433.milc.test 89874.00 89862.00 -0.0%
test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 835492.00 835346.00 -0.0%
test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/miniFE.test 66230.00 66202.00 -0.0%
test-suite :: External/SPEC/CFP2017rate/511.povray_r/511.povray_r.test 946090.00 944206.00 -0.2%
test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test 1136404.00 1131854.00 -0.4%
test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test 1136404.00 1131854.00 -0.4%
gcc-c-torture/execute/GCC-C-execute-pr28982b - better vector code
Olden/health - extra vector code
CINT2017speed/625.x264_s
CINT2017rate/525.x264_r - small variation + improvements in reordering, @pixel_hadamard_ac stopped
being vectorized because of some non-effective shuffle recognition by
the compiler
CINT2017rate/502.gcc_r
CINT2017speed/602.gcc_s - small variations
CFP2017rate/508.namd_r - small variations
CFP2017rate/526.blender_r - small variations
CFP2006/453.povray - extra vector code
Benchmarks/7zip - extra vector code
DOE-ProxyApps-C++/miniFE - small variations
CFP2017rate/511.povray_r - extra vector code
CFP2017speed/638.imagick_s
CFP2017rate/538.imagick_r - extra vector code