
Commit 4803d32

alexey-bataev authored and KornevNikita committed
[SLP]Fix/improve getSpillCost analysis
The previous implementation could take extra time by walking over the same instructions several times, and it did not properly analyze cross-basic-block uses of the vectorized values. This version fixes both issues. It walks the tree and checks the dependencies between entries and their operands. If there are non-vectorized calls in between, it adds a single(!) spill cost, because the vector value needs to be spilled/reloaded only once. It also caches the analysis for each entry it visits instead of repeating it, reusing the data gathered while analyzing previous nodes. Finally, it has an internal limit: if the number of instructions between a node and its operands is too large (greater than ScheduleRegionSizeBudget / VectorizableTree.size()), a spill is assumed to be required. This improves compile time.

Reviewers: preames, RKSimon, mikhailramalho

Reviewed By: preames

Pull Request: llvm/llvm-project#129258
1 parent 245d31d commit 4803d32
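The idea described in the commit message — charge a single spill/reload per operand value if any non-vectorized call lies between an entry and its operand, memoize scan results, and bound the scan with a budget derived from the tree size — can be illustrated with a small self-contained sketch. This is not part of the commit and does not use the LLVM APIs from the patch; the types Instr, Entry, the noCallInRange helper, and the constant 16 standing in for ScheduleRegionSizeBudget are all invented for illustration.

// Minimal sketch of the budget-limited, memoized spill-cost walk.
#include <cstdio>
#include <unordered_map>
#include <vector>

struct Instr { bool IsNonVecCall; };                // one scalar instruction
struct Entry { int LastInstIdx; std::vector<int> Operands; };

// Returns true if no non-vectorized call lies in (From, To] of the stream,
// scanning at most Budget instructions and caching the result keyed by To.
static bool noCallInRange(const std::vector<Instr> &Stream, int From, int To,
                          unsigned Budget,
                          std::unordered_map<int, bool> &Cache) {
  if (auto It = Cache.find(To); It != Cache.end())
    return It->second;
  bool Clean = true;
  for (int I = From + 1; I <= To && Budget--; ++I)
    if (Stream[I].IsNonVecCall) { Clean = false; break; }
  Cache[To] = Clean;                                // memoize for later edges
  return Clean;
}

int main() {
  // Instruction stream: index 2 is a plain (non-vectorized) call.
  std::vector<Instr> Stream = {{false}, {false}, {true}, {false}, {false}};
  // Entry 0 (root) at instruction 4 uses entry 1, whose value is defined at 0.
  std::vector<Entry> Tree = {{4, {1}}, {0, {}}};
  const unsigned BudgetLimit = 16 / Tree.size();    // mirrors the budget idea

  std::unordered_map<int, bool> Cache;
  unsigned SpillCost = 0;
  for (const Entry &E : Tree)
    for (int OpIdx : E.Operands)
      // A single spill/reload unit is charged per operand value, no matter
      // how many calls the scan finds in between.
      if (!noCallInRange(Stream, Tree[OpIdx].LastInstIdx, E.LastInstIdx,
                         BudgetLimit, Cache))
        ++SpillCost;
  std::printf("spill cost units: %u\n", SpillCost); // prints 1
  return 0;
}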

File tree

3 files changed: +243 -106 lines changed


llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 217 additions & 97 deletions
@@ -12165,112 +12165,224 @@ InstructionCost BoUpSLP::getSpillCost() const {
   // live. When we see a call instruction that is not part of our tree,
   // query TTI to see if there is a cost to keeping values live over it
   // (for example, if spills and fills are required).
-  unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
-  InstructionCost Cost = 0;

-  SmallPtrSet<Instruction *, 4> LiveValues;
-  Instruction *PrevInst = nullptr;
+  const TreeEntry *Root = VectorizableTree.front().get();
+  if (Root->isGather())
+    return 0;

-  // The entries in VectorizableTree are not necessarily ordered by their
-  // position in basic blocks. Collect them and order them by dominance so later
-  // instructions are guaranteed to be visited first. For instructions in
-  // different basic blocks, we only scan to the beginning of the block, so
-  // their order does not matter, as long as all instructions in a basic block
-  // are grouped together. Using dominance ensures a deterministic order.
-  SmallVector<Instruction *, 16> OrderedScalars;
+  InstructionCost Cost = 0;
+  SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
+      EntriesToOperands;
+  SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
+  SmallPtrSet<const Instruction *, 8> LastInstructions;
   for (const auto &TEPtr : VectorizableTree) {
-    if (TEPtr->State != TreeEntry::Vectorize)
-      continue;
-    Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
-    if (!Inst)
-      continue;
-    OrderedScalars.push_back(Inst);
-  }
-  llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
-    auto *NodeA = DT->getNode(A->getParent());
-    auto *NodeB = DT->getNode(B->getParent());
-    assert(NodeA && "Should only process reachable instructions");
-    assert(NodeB && "Should only process reachable instructions");
-    assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
-           "Different nodes should have different DFS numbers");
-    if (NodeA != NodeB)
-      return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
-    return B->comesBefore(A);
-  });
-
-  for (Instruction *Inst : OrderedScalars) {
-    if (!PrevInst) {
-      PrevInst = Inst;
-      continue;
+    if (!TEPtr->isGather()) {
+      Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
+      EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
+      LastInstructions.insert(LastInst);
     }
+    if (TEPtr->UserTreeIndex)
+      EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
+  }

-    // Update LiveValues.
-    LiveValues.erase(PrevInst);
-    for (auto &J : PrevInst->operands()) {
-      if (isa<Instruction>(&*J) && isVectorized(&*J))
-        LiveValues.insert(cast<Instruction>(&*J));
+  auto NoCallIntrinsic = [this](const Instruction *I) {
+    const auto *II = dyn_cast<IntrinsicInst>(I);
+    if (!II)
+      return false;
+    if (II->isAssumeLikeIntrinsic())
+      return true;
+    IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
+    InstructionCost IntrCost =
+        TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
+    InstructionCost CallCost = TTI->getCallInstrCost(
+        nullptr, II->getType(), ICA.getArgTypes(), TTI::TCK_RecipThroughput);
+    return IntrCost < CallCost;
+  };
+
+  // Maps last instruction in the entry to the last instruction for the one of
+  // operand entries and the flag. If the flag is true, there are no calls in
+  // between these instructions.
+  SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
+      CheckedInstructions;
+  unsigned Budget = 0;
+  const unsigned BudgetLimit =
+      ScheduleRegionSizeBudget / VectorizableTree.size();
+  auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
+                                            const Instruction *Last) {
+    assert(First->getParent() == Last->getParent() &&
+           "Expected instructions in same block.");
+    if (auto It = CheckedInstructions.find(Last);
+        It != CheckedInstructions.end()) {
+      const Instruction *Checked = It->second.getPointer();
+      if (Checked == First || Checked->comesBefore(First))
+        return It->second.getInt() != 0;
+      Last = Checked;
+    } else if (Last == First || Last->comesBefore(First)) {
+      return true;
     }
+    BasicBlock::const_reverse_iterator InstIt =
+                                           ++First->getIterator().getReverse(),
+                                       PrevInstIt =
+                                           Last->getIterator().getReverse();
+    SmallVector<const Instruction *> LastInstsInRange;
+    while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
+      // Debug information does not impact spill cost.
+      // Vectorized calls, represented as vector intrinsics, do not impact spill
+      // cost.
+      if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
+          CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
+        for (const Instruction *LastInst : LastInstsInRange)
+          CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
+        return false;
+      }
+      if (LastInstructions.contains(&*PrevInstIt))
+        LastInstsInRange.push_back(&*PrevInstIt);

-    LLVM_DEBUG({
-      dbgs() << "SLP: #LV: " << LiveValues.size();
-      for (auto *X : LiveValues)
-        dbgs() << " " << X->getName();
-      dbgs() << ", Looking at ";
-      Inst->dump();
+      ++PrevInstIt;
+      ++Budget;
+    }
+    for (const Instruction *LastInst : LastInstsInRange)
+      CheckedInstructions.try_emplace(
+          LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
+          Budget <= BudgetLimit ? 1 : 0);
+    return Budget <= BudgetLimit;
+  };
+  auto AddCosts = [&](const TreeEntry *Op) {
+    Type *ScalarTy = Op->Scalars.front()->getType();
+    auto It = MinBWs.find(Op);
+    if (It != MinBWs.end())
+      ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
+    auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
+    Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
+    if (ScalarTy->isVectorTy()) {
+      // Handle revec dead vector instructions.
+      Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
+    }
+  };
+  // Memoize the relationship between blocks, i.e. if there is (at least one)
+  // non-vectorized call between the blocks. This allows to skip the analysis of
+  // the same block paths multiple times.
+  SmallDenseMap<std::pair<const BasicBlock *, const BasicBlock *>, bool>
+      ParentOpParentToPreds;
+  auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
+                               BasicBlock *OpParent) {
+    auto Key = std::make_pair(Root, OpParent);
+    if (auto It = ParentOpParentToPreds.find(Key);
+        It != ParentOpParentToPreds.end())
+      return It->second;
+    SmallVector<BasicBlock *> Worklist;
+    if (Pred)
+      Worklist.push_back(Pred);
+    else
+      Worklist.append(pred_begin(Root), pred_end(Root));
+    SmallPtrSet<const BasicBlock *, 16> Visited;
+    SmallDenseSet<std::pair<const BasicBlock *, const BasicBlock *>>
+        ParentsPairsToAdd;
+    bool Res = false;
+    auto Cleanup = make_scope_exit([&]() {
+      for (const auto &KeyPair : ParentsPairsToAdd) {
+        assert(!ParentOpParentToPreds.contains(KeyPair) &&
+               "Should not have been added before.");
+        ParentOpParentToPreds.try_emplace(KeyPair, Res);
+      }
     });
-
-    // Now find the sequence of instructions between PrevInst and Inst.
-    unsigned NumCalls = 0;
-    BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
-                                 PrevInstIt =
-                                     PrevInst->getIterator().getReverse();
-    while (InstIt != PrevInstIt) {
-      if (PrevInstIt == PrevInst->getParent()->rend()) {
-        PrevInstIt = Inst->getParent()->rbegin();
+    while (!Worklist.empty()) {
+      BasicBlock *BB = Worklist.pop_back_val();
+      if (BB == OpParent || !Visited.insert(BB).second)
         continue;
+      auto Pair = std::make_pair(BB, OpParent);
+      if (auto It = ParentOpParentToPreds.find(Pair);
+          It != ParentOpParentToPreds.end()) {
+        Res = It->second;
+        return Res;
       }
-
-      auto NoCallIntrinsic = [this](Instruction *I) {
-        auto *II = dyn_cast<IntrinsicInst>(I);
-        if (!II)
-          return false;
-        if (II->isAssumeLikeIntrinsic())
-          return true;
-        FastMathFlags FMF;
-        SmallVector<Type *, 4> Tys;
-        for (auto &ArgOp : II->args())
-          Tys.push_back(ArgOp->getType());
-        if (auto *FPMO = dyn_cast<FPMathOperator>(II))
-          FMF = FPMO->getFastMathFlags();
-        IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
-                                    FMF);
-        InstructionCost IntrCost =
-            TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
-        InstructionCost CallCost = TTI->getCallInstrCost(
-            nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
-        return IntrCost < CallCost;
-      };
-
-      // Debug information does not impact spill cost.
-      if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
-          &*PrevInstIt != PrevInst)
-        NumCalls++;
-
-      ++PrevInstIt;
+      ParentsPairsToAdd.insert(Pair);
+      unsigned BlockSize = BB->size();
+      if (BlockSize > static_cast<unsigned>(ScheduleRegionSizeBudget))
+        return Res;
+      Budget += BlockSize;
+      if (Budget > BudgetLimit)
+        return Res;
+      if (!CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
+                                          BB->getTerminator()))
+        return Res;
+      Worklist.append(pred_begin(BB), pred_end(BB));
     }
-
-    if (NumCalls) {
-      SmallVector<Type *, 4> V;
-      for (auto *II : LiveValues) {
-        auto *ScalarTy = II->getType();
-        if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
-          ScalarTy = VectorTy->getElementType();
-        V.push_back(getWidenedType(ScalarTy, BundleWidth));
+    Res = true;
+    return Res;
+  };
+  SmallVector<const TreeEntry *> LiveEntries(1, Root);
+  while (!LiveEntries.empty()) {
+    const TreeEntry *Entry = LiveEntries.pop_back_val();
+    SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
+    if (Operands.empty())
+      continue;
+    Instruction *LastInst = EntriesToLastInstruction.at(Entry);
+    BasicBlock *Parent = LastInst->getParent();
+    for (const TreeEntry *Op : Operands) {
+      if (!Op->isGather())
+        LiveEntries.push_back(Op);
+      if ((Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
+          (Op->isGather() && allConstant(Op->Scalars)))
+        continue;
+      Budget = 0;
+      BasicBlock *Pred = nullptr;
+      if (auto *Phi = dyn_cast<PHINode>(Entry->getMainOp()))
+        Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
+      BasicBlock *OpParent;
+      Instruction *OpLastInst;
+      if (Op->isGather()) {
+        assert(Entry->getOpcode() == Instruction::PHI &&
+               "Expected phi node only.");
+        OpParent = cast<PHINode>(Entry->getMainOp())
+                       ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
+        OpLastInst = OpParent->getTerminator();
+        for (Value *V : Op->Scalars) {
+          auto *Inst = dyn_cast<Instruction>(V);
+          if (!Inst)
+            continue;
+          if (isVectorized(V)) {
+            OpParent = Inst->getParent();
+            OpLastInst = Inst;
+            break;
+          }
+        }
+      } else {
+        OpLastInst = EntriesToLastInstruction.at(Op);
+        OpParent = OpLastInst->getParent();
+      }
+      // Check the call instructions within the same basic blocks.
+      if (OpParent == Parent) {
+        if (Entry->getOpcode() == Instruction::PHI) {
+          if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
+            AddCosts(Op);
+          continue;
+        }
+        if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
+          AddCosts(Op);
+        continue;
+      }
+      // Check for call instruction in between blocks.
+      // 1. Check entry's block to the head.
+      if (Entry->getOpcode() != Instruction::PHI &&
+          !CheckForNonVecCallsInSameBlock(
+              &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
+              LastInst)) {
+        AddCosts(Op);
+        continue;
+      }
+      // 2. Check op's block from the end.
+      if (!CheckForNonVecCallsInSameBlock(OpLastInst,
+                                          OpParent->getTerminator())) {
+        AddCosts(Op);
+        continue;
+      }
+      // 3. Check the predecessors of entry's block till op's block.
+      if (!CheckPredecessors(Parent, Pred, OpParent)) {
+        AddCosts(Op);
+        continue;
       }
-      Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
     }
-
-    PrevInst = Inst;
   }

   return Cost;
@@ -12778,8 +12890,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
     }
   }

-  InstructionCost SpillCost = getSpillCost();
-  Cost += SpillCost + ExtractCost;
+  Cost += ExtractCost;
   auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
                                     bool) {
     InstructionCost C = 0;
@@ -12918,12 +13029,21 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
     }
   }

+  std::optional<InstructionCost> SpillCost;
+  if (Cost < -SLPCostThreshold) {
+    SpillCost = getSpillCost();
+    Cost += *SpillCost;
+  }
 #ifndef NDEBUG
   SmallString<256> Str;
   {
     raw_svector_ostream OS(Str);
-    OS << "SLP: Spill Cost = " << SpillCost << ".\n"
-       << "SLP: Extract Cost = " << ExtractCost << ".\n"
+    OS << "SLP: Spill Cost = ";
+    if (SpillCost)
+      OS << *SpillCost;
+    else
+      OS << "<skipped>";
+    OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
        << "SLP: Total Cost = " << Cost << ".\n";
   }
   LLVM_DEBUG(dbgs() << Str);

llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll

Lines changed: 12 additions & 4 deletions
@@ -1740,7 +1740,9 @@ entry:
 define void @f(i1 %c, ptr %p, ptr %q, ptr %r) {
 ; CHECK-LABEL: define void @f
 ; CHECK-SAME: (i1 [[C:%.*]], ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[R:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[P]], align 8
+; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[P]], align 8
+; CHECK-NEXT: [[P1:%.*]] = getelementptr i64, ptr [[P]], i64 1
+; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[P1]], align 8
 ; CHECK-NEXT: br i1 [[C]], label [[FOO:%.*]], label [[BAR:%.*]]
 ; CHECK: foo:
 ; CHECK-NEXT: [[Y0:%.*]] = load float, ptr [[R]], align 4
@@ -1751,12 +1753,16 @@ define void @f(i1 %c, ptr %p, ptr %q, ptr %r) {
 ; CHECK-NEXT: [[Z1:%.*]] = call float @fabsf(float [[Z0]])
 ; CHECK-NEXT: br label [[BAZ]]
 ; CHECK: baz:
-; CHECK-NEXT: store <2 x i64> [[TMP1]], ptr [[Q]], align 8
+; CHECK-NEXT: store i64 [[X0]], ptr [[Q]], align 8
+; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[Q]], i64 1
+; CHECK-NEXT: store i64 [[X1]], ptr [[Q1]], align 8
 ; CHECK-NEXT: ret void
 ;
 ; DEFAULT-LABEL: define void @f
 ; DEFAULT-SAME: (i1 [[C:%.*]], ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[R:%.*]]) #[[ATTR1]] {
-; DEFAULT-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[P]], align 8
+; DEFAULT-NEXT: [[X0:%.*]] = load i64, ptr [[P]], align 8
+; DEFAULT-NEXT: [[P1:%.*]] = getelementptr i64, ptr [[P]], i64 1
+; DEFAULT-NEXT: [[X1:%.*]] = load i64, ptr [[P1]], align 8
 ; DEFAULT-NEXT: br i1 [[C]], label [[FOO:%.*]], label [[BAR:%.*]]
 ; DEFAULT: foo:
 ; DEFAULT-NEXT: [[Y0:%.*]] = load float, ptr [[R]], align 4
@@ -1767,7 +1773,9 @@ define void @f(i1 %c, ptr %p, ptr %q, ptr %r) {
 ; DEFAULT-NEXT: [[Z1:%.*]] = call float @fabsf(float [[Z0]])
 ; DEFAULT-NEXT: br label [[BAZ]]
 ; DEFAULT: baz:
-; DEFAULT-NEXT: store <2 x i64> [[TMP1]], ptr [[Q]], align 8
+; DEFAULT-NEXT: store i64 [[X0]], ptr [[Q]], align 8
+; DEFAULT-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[Q]], i64 1
+; DEFAULT-NEXT: store i64 [[X1]], ptr [[Q1]], align 8
 ; DEFAULT-NEXT: ret void
 ;
 %x0 = load i64, ptr %p
