Skip to content

Commit c5140b6

Browse files
alexey-bataev authored and KornevNikita committed
[SLP]Fix graph traversal in getSpillCost
getSpillCost relies on def-use order when it performs the analysis of live-over-call spills for the vectorized instructions. This patch fixes it to check the dependencies based on TreeEntries and to perform actual vectorized type analysis. Reviewers: RKSimon, preames. Reviewed By: preames. Pull Request: llvm/llvm-project#124984
1 parent 0454ef0 commit c5140b6

File tree

3 files changed

+466
-796
lines changed

3 files changed

+466
-796
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 99 additions & 206 deletions
Original file line numberDiff line numberDiff line change
@@ -1395,7 +1395,7 @@ class BoUpSLP {
13951395

13961396
/// \returns the cost incurred by unwanted spills and fills, caused by
13971397
/// holding live values over call sites.
1398-
InstructionCost getSpillCost() const;
1398+
InstructionCost getSpillCost();
13991399

14001400
/// \returns the vectorization cost of the subtree that starts at \p VL.
14011401
/// A negative number means that this is profitable.
@@ -2958,7 +2958,7 @@ class BoUpSLP {
29582958
}
29592959

29602960
/// Check if the value is vectorized in the tree.
2961-
bool isVectorized(Value *V) const {
2961+
bool isVectorized(const Value *V) const {
29622962
assert(V && "V cannot be nullptr.");
29632963
return ScalarToTreeEntries.contains(V);
29642964
}
@@ -12160,230 +12160,123 @@ bool BoUpSLP::isTreeNotExtendable() const {
1216012160
return Res;
1216112161
}
1216212162

12163-
InstructionCost BoUpSLP::getSpillCost() const {
12163+
InstructionCost BoUpSLP::getSpillCost() {
1216412164
// Walk from the bottom of the tree to the top, tracking which values are
1216512165
// live. When we see a call instruction that is not part of our tree,
1216612166
// query TTI to see if there is a cost to keeping values live over it
1216712167
// (for example, if spills and fills are required).
12168+
InstructionCost Cost = 0;
1216812169

12169-
const TreeEntry *Root = VectorizableTree.front().get();
12170-
if (Root->isGather())
12171-
return 0;
12170+
SmallPtrSet<const TreeEntry *, 4> LiveEntries;
12171+
const TreeEntry *Prev = nullptr;
1217212172

12173-
InstructionCost Cost = 0;
12174-
SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
12175-
EntriesToOperands;
12176-
SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
12177-
SmallPtrSet<const Instruction *, 8> LastInstructions;
12173+
// The entries in VectorizableTree are not necessarily ordered by their
12174+
// position in basic blocks. Collect them and order them by dominance so later
12175+
// instructions are guaranteed to be visited first. For instructions in
12176+
// different basic blocks, we only scan to the beginning of the block, so
12177+
// their order does not matter, as long as all instructions in a basic block
12178+
// are grouped together. Using dominance ensures a deterministic order.
12179+
SmallVector<TreeEntry *, 16> OrderedEntries;
1217812180
for (const auto &TEPtr : VectorizableTree) {
12179-
if (!TEPtr->isGather()) {
12180-
Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
12181-
EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
12182-
LastInstructions.insert(LastInst);
12183-
}
12184-
if (TEPtr->UserTreeIndex)
12185-
EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
12186-
}
12181+
if (TEPtr->isGather())
12182+
continue;
12183+
OrderedEntries.push_back(TEPtr.get());
12184+
}
12185+
llvm::stable_sort(OrderedEntries, [&](const TreeEntry *TA,
12186+
const TreeEntry *TB) {
12187+
Instruction &A = getLastInstructionInBundle(TA);
12188+
Instruction &B = getLastInstructionInBundle(TB);
12189+
auto *NodeA = DT->getNode(A.getParent());
12190+
auto *NodeB = DT->getNode(B.getParent());
12191+
assert(NodeA && "Should only process reachable instructions");
12192+
assert(NodeB && "Should only process reachable instructions");
12193+
assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
12194+
"Different nodes should have different DFS numbers");
12195+
if (NodeA != NodeB)
12196+
return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
12197+
return B.comesBefore(&A);
12198+
});
1218712199

12188-
auto NoCallIntrinsic = [this](const Instruction *I) {
12189-
const auto *II = dyn_cast<IntrinsicInst>(I);
12190-
if (!II)
12191-
return false;
12192-
if (II->isAssumeLikeIntrinsic())
12193-
return true;
12194-
IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
12195-
InstructionCost IntrCost =
12196-
TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
12197-
InstructionCost CallCost = TTI->getCallInstrCost(
12198-
nullptr, II->getType(), ICA.getArgTypes(), TTI::TCK_RecipThroughput);
12199-
return IntrCost < CallCost;
12200-
};
12200+
for (const TreeEntry *TE : OrderedEntries) {
12201+
if (!Prev) {
12202+
Prev = TE;
12203+
continue;
12204+
}
1220112205

12202-
// Maps last instruction in the entry to the last instruction for the one of
12203-
// operand entries and the flag. If the flag is true, there are no calls in
12204-
// between these instructions.
12205-
SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
12206-
CheckedInstructions;
12207-
unsigned Budget = 0;
12208-
const unsigned BudgetLimit =
12209-
ScheduleRegionSizeBudget / VectorizableTree.size();
12210-
auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
12211-
const Instruction *Last) {
12212-
assert(First->getParent() == Last->getParent() &&
12213-
"Expected instructions in same block.");
12214-
if (auto It = CheckedInstructions.find(Last);
12215-
It != CheckedInstructions.end()) {
12216-
const Instruction *Checked = It->second.getPointer();
12217-
if (Checked == First || Checked->comesBefore(First))
12218-
return It->second.getInt() != 0;
12219-
Last = Checked;
12220-
} else if (Last == First || Last->comesBefore(First)) {
12221-
return true;
12206+
LiveEntries.erase(Prev);
12207+
for (unsigned I : seq<unsigned>(Prev->getNumOperands())) {
12208+
const TreeEntry *Op = getVectorizedOperand(Prev, I);
12209+
if (!Op)
12210+
continue;
12211+
assert(!Op->isGather() && "Expected vectorized operand.");
12212+
LiveEntries.insert(Op);
1222212213
}
12223-
BasicBlock::const_reverse_iterator InstIt =
12224-
++First->getIterator().getReverse(),
12225-
PrevInstIt =
12226-
Last->getIterator().getReverse();
12227-
SmallVector<const Instruction *> LastInstsInRange;
12228-
while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
12214+
12215+
LLVM_DEBUG({
12216+
dbgs() << "SLP: #LV: " << LiveEntries.size();
12217+
for (auto *X : LiveEntries)
12218+
X->dump();
12219+
dbgs() << ", Looking at ";
12220+
TE->dump();
12221+
});
12222+
12223+
// Now find the sequence of instructions between PrevInst and Inst.
12224+
unsigned NumCalls = 0;
12225+
const Instruction *PrevInst = &getLastInstructionInBundle(Prev);
12226+
BasicBlock::const_reverse_iterator
12227+
InstIt = ++getLastInstructionInBundle(TE).getIterator().getReverse(),
12228+
PrevInstIt = PrevInst->getIterator().getReverse();
12229+
while (InstIt != PrevInstIt) {
12230+
if (PrevInstIt == PrevInst->getParent()->rend()) {
12231+
PrevInstIt = getLastInstructionInBundle(TE).getParent()->rbegin();
12232+
continue;
12233+
}
12234+
12235+
auto NoCallIntrinsic = [this](const Instruction *I) {
12236+
const auto *II = dyn_cast<IntrinsicInst>(I);
12237+
if (!II)
12238+
return false;
12239+
if (II->isAssumeLikeIntrinsic())
12240+
return true;
12241+
FastMathFlags FMF;
12242+
SmallVector<Type *, 4> Tys;
12243+
for (auto &ArgOp : II->args())
12244+
Tys.push_back(ArgOp->getType());
12245+
if (auto *FPMO = dyn_cast<FPMathOperator>(II))
12246+
FMF = FPMO->getFastMathFlags();
12247+
IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
12248+
FMF);
12249+
InstructionCost IntrCost =
12250+
TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
12251+
InstructionCost CallCost = TTI->getCallInstrCost(
12252+
nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
12253+
return IntrCost < CallCost;
12254+
};
12255+
1222912256
// Debug information does not impact spill cost.
1223012257
// Vectorized calls, represented as vector intrinsics, do not impact spill
1223112258
// cost.
1223212259
if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
12233-
CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
12234-
for (const Instruction *LastInst : LastInstsInRange)
12235-
CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
12236-
return false;
12237-
}
12238-
if (LastInstructions.contains(&*PrevInstIt))
12239-
LastInstsInRange.push_back(&*PrevInstIt);
12260+
CB && !NoCallIntrinsic(CB) && !isVectorized(CB))
12261+
NumCalls++;
1224012262

1224112263
++PrevInstIt;
1224212264
++Budget;
1224312265
}
12244-
for (const Instruction *LastInst : LastInstsInRange)
12245-
CheckedInstructions.try_emplace(
12246-
LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
12247-
Budget <= BudgetLimit ? 1 : 0);
12248-
return Budget <= BudgetLimit;
12249-
};
12250-
auto AddCosts = [&](const TreeEntry *Op) {
12251-
Type *ScalarTy = Op->Scalars.front()->getType();
12252-
auto It = MinBWs.find(Op);
12253-
if (It != MinBWs.end())
12254-
ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
12255-
auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
12256-
Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
12257-
if (ScalarTy->isVectorTy()) {
12258-
// Handle revec dead vector instructions.
12259-
Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
12260-
}
12261-
};
12262-
// Memoize the relationship between blocks, i.e. if there is (at least one)
12263-
// non-vectorized call between the blocks. This allows to skip the analysis of
12264-
// the same block paths multiple times.
12265-
SmallDenseMap<std::pair<const BasicBlock *, const BasicBlock *>, bool>
12266-
ParentOpParentToPreds;
12267-
auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
12268-
BasicBlock *OpParent) {
12269-
auto Key = std::make_pair(Root, OpParent);
12270-
if (auto It = ParentOpParentToPreds.find(Key);
12271-
It != ParentOpParentToPreds.end())
12272-
return It->second;
12273-
SmallVector<BasicBlock *> Worklist;
12274-
if (Pred)
12275-
Worklist.push_back(Pred);
12276-
else
12277-
Worklist.append(pred_begin(Root), pred_end(Root));
12278-
SmallPtrSet<const BasicBlock *, 16> Visited;
12279-
SmallDenseSet<std::pair<const BasicBlock *, const BasicBlock *>>
12280-
ParentsPairsToAdd;
12281-
bool Res = false;
12282-
auto Cleanup = make_scope_exit([&]() {
12283-
for (const auto &KeyPair : ParentsPairsToAdd) {
12284-
assert(!ParentOpParentToPreds.contains(KeyPair) &&
12285-
"Should not have been added before.");
12286-
ParentOpParentToPreds.try_emplace(KeyPair, Res);
12287-
}
12288-
});
12289-
while (!Worklist.empty()) {
12290-
BasicBlock *BB = Worklist.pop_back_val();
12291-
if (BB == OpParent || !Visited.insert(BB).second)
12292-
continue;
12293-
auto Pair = std::make_pair(BB, OpParent);
12294-
if (auto It = ParentOpParentToPreds.find(Pair);
12295-
It != ParentOpParentToPreds.end()) {
12296-
Res = It->second;
12297-
return Res;
12298-
}
12299-
ParentsPairsToAdd.insert(Pair);
12300-
unsigned BlockSize = BB->size();
12301-
if (BlockSize > static_cast<unsigned>(ScheduleRegionSizeBudget))
12302-
return Res;
12303-
Budget += BlockSize;
12304-
if (Budget > BudgetLimit)
12305-
return Res;
12306-
if (!isa<CatchSwitchInst>(BB->getTerminator()) &&
12307-
!CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
12308-
BB->getTerminator()))
12309-
return Res;
12310-
Worklist.append(pred_begin(BB), pred_end(BB));
12311-
}
12312-
Res = true;
12313-
return Res;
12314-
};
12315-
SmallVector<const TreeEntry *> LiveEntries(1, Root);
12316-
while (!LiveEntries.empty()) {
12317-
const TreeEntry *Entry = LiveEntries.pop_back_val();
12318-
SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
12319-
if (Operands.empty())
12320-
continue;
12321-
Instruction *LastInst = EntriesToLastInstruction.at(Entry);
12322-
BasicBlock *Parent = LastInst->getParent();
12323-
for (const TreeEntry *Op : Operands) {
12324-
if (!Op->isGather())
12325-
LiveEntries.push_back(Op);
12326-
if ((Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
12327-
(Op->isGather() && allConstant(Op->Scalars)))
12328-
continue;
12329-
Budget = 0;
12330-
BasicBlock *Pred = nullptr;
12331-
if (auto *Phi = dyn_cast<PHINode>(Entry->getMainOp()))
12332-
Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
12333-
BasicBlock *OpParent;
12334-
Instruction *OpLastInst;
12335-
if (Op->isGather()) {
12336-
assert(Entry->getOpcode() == Instruction::PHI &&
12337-
"Expected phi node only.");
12338-
OpParent = cast<PHINode>(Entry->getMainOp())
12339-
->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
12340-
OpLastInst = OpParent->getTerminator();
12341-
for (Value *V : Op->Scalars) {
12342-
auto *Inst = dyn_cast<Instruction>(V);
12343-
if (!Inst)
12344-
continue;
12345-
if (isVectorized(V)) {
12346-
OpParent = Inst->getParent();
12347-
OpLastInst = Inst;
12348-
break;
12349-
}
12350-
}
12351-
} else {
12352-
OpLastInst = EntriesToLastInstruction.at(Op);
12353-
OpParent = OpLastInst->getParent();
12354-
}
12355-
// Check the call instructions within the same basic blocks.
12356-
if (OpParent == Parent) {
12357-
if (Entry->getOpcode() == Instruction::PHI) {
12358-
if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
12359-
AddCosts(Op);
12360-
continue;
12361-
}
12362-
if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
12363-
AddCosts(Op);
12364-
continue;
12365-
}
12366-
// Check for call instruction in between blocks.
12367-
// 1. Check entry's block to the head.
12368-
if (Entry->getOpcode() != Instruction::PHI &&
12369-
!CheckForNonVecCallsInSameBlock(
12370-
&*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
12371-
LastInst)) {
12372-
AddCosts(Op);
12373-
continue;
12374-
}
12375-
// 2. Check op's block from the end.
12376-
if (!CheckForNonVecCallsInSameBlock(OpLastInst,
12377-
OpParent->getTerminator())) {
12378-
AddCosts(Op);
12379-
continue;
12380-
}
12381-
// 3. Check the predecessors of entry's block till op's block.
12382-
if (!CheckPredecessors(Parent, Pred, OpParent)) {
12383-
AddCosts(Op);
12384-
continue;
12266+
12267+
if (NumCalls) {
12268+
SmallVector<Type *, 4> EntriesTypes;
12269+
for (const TreeEntry *TE : LiveEntries) {
12270+
auto *ScalarTy = TE->getMainOp()->getType();
12271+
auto It = MinBWs.find(TE);
12272+
if (It != MinBWs.end())
12273+
ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
12274+
EntriesTypes.push_back(getWidenedType(ScalarTy, TE->getVectorFactor()));
1238512275
}
12276+
Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(EntriesTypes);
1238612277
}
12278+
12279+
Prev = TE;
1238712280
}
1238812281

1238912282
return Cost;

0 commit comments

Comments
 (0)