@@ -1395,7 +1395,7 @@ class BoUpSLP {

  /// \returns the cost incurred by unwanted spills and fills, caused by
  /// holding live values over call sites.
-  InstructionCost getSpillCost() const;
+  InstructionCost getSpillCost();

  /// \returns the vectorization cost of the subtree that starts at \p VL.
  /// A negative number means that this is profitable.
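(Editorial note, not part of the diff.) The spill cost documented above appears whenever a vectorized value is still needed after an unrelated call: on targets without callee-saved vector registers (x86-64 SysV, for example) the vector register holding it must be spilled before the call and refilled afterwards. A minimal, hypothetical illustration of the pattern this cost is pricing:

```cpp
// Illustration only; `update` is a made-up opaque callee.
extern void update();

void store_pair(double *out, const double *a, const double *b) {
  double x = a[0] * b[0]; // live across the call below
  double y = a[1] * b[1]; // live across the call below
  update();               // non-vectorized call site
  out[0] = x;             // if {x, y} were SLP-vectorized, the <2 x double>
  out[1] = y;             // register would have to survive update()
}
```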
@@ -2958,7 +2958,7 @@ class BoUpSLP {
  }

  /// Check if the value is vectorized in the tree.
-  bool isVectorized(Value *V) const {
+  bool isVectorized(const Value *V) const {
    assert(V && "V cannot be nullptr.");
    return ScalarToTreeEntries.contains(V);
  }
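(Editorial aside, not part of the diff.) Widening `isVectorized` to take `const Value *` is the usual const-correctness move: a membership query does not modify the value, so callers that only hold `const` pointers, such as the rewritten `getSpillCost()` walking blocks through a `const_reverse_iterator`, can call it without a cast. A self-contained sketch of the same pattern with hypothetical names:

```cpp
#include <cassert>
#include <unordered_set>

struct Node { int Id; };

class Tree {
  std::unordered_set<const Node *> Vectorized;

public:
  void add(const Node *N) { Vectorized.insert(N); }

  // Taking `const Node *`, like isVectorized(const Value *) above, lets code
  // that only has a const view of the IR ask the question.
  bool isVectorized(const Node *N) const {
    assert(N && "N cannot be nullptr.");
    return Vectorized.count(N) != 0;
  }
};

int main() {
  Node A{0}, B{1};
  Tree T;
  T.add(&A);
  const Node *P = &B; // const pointer, as a const_iterator would hand out
  assert(T.isVectorized(&A) && !T.isVectorized(P));
}
```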
@@ -12160,230 +12160,123 @@ bool BoUpSLP::isTreeNotExtendable() const {
  return Res;
}

-InstructionCost BoUpSLP::getSpillCost() const {
+InstructionCost BoUpSLP::getSpillCost() {
  // Walk from the bottom of the tree to the top, tracking which values are
  // live. When we see a call instruction that is not part of our tree,
  // query TTI to see if there is a cost to keeping values live over it
  // (for example, if spills and fills are required).
+  InstructionCost Cost = 0;

-  const TreeEntry *Root = VectorizableTree.front().get();
-  if (Root->isGather())
-    return 0;
+  SmallPtrSet<const TreeEntry *, 4> LiveEntries;
+  const TreeEntry *Prev = nullptr;

-  InstructionCost Cost = 0;
-  SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
-      EntriesToOperands;
-  SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
-  SmallPtrSet<const Instruction *, 8> LastInstructions;
+  // The entries in VectorizableTree are not necessarily ordered by their
+  // position in basic blocks. Collect them and order them by dominance so later
+  // instructions are guaranteed to be visited first. For instructions in
+  // different basic blocks, we only scan to the beginning of the block, so
+  // their order does not matter, as long as all instructions in a basic block
+  // are grouped together. Using dominance ensures a deterministic order.
+  SmallVector<TreeEntry *, 16> OrderedEntries;
  for (const auto &TEPtr : VectorizableTree) {
-    if (!TEPtr->isGather()) {
-      Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
-      EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
-      LastInstructions.insert(LastInst);
-    }
-    if (TEPtr->UserTreeIndex)
-      EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
-  }
+    if (TEPtr->isGather())
+      continue;
+    OrderedEntries.push_back(TEPtr.get());
+  }
+  llvm::stable_sort(OrderedEntries, [&](const TreeEntry *TA,
+                                        const TreeEntry *TB) {
+    Instruction &A = getLastInstructionInBundle(TA);
+    Instruction &B = getLastInstructionInBundle(TB);
+    auto *NodeA = DT->getNode(A.getParent());
+    auto *NodeB = DT->getNode(B.getParent());
+    assert(NodeA && "Should only process reachable instructions");
+    assert(NodeB && "Should only process reachable instructions");
+    assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
+           "Different nodes should have different DFS numbers");
+    if (NodeA != NodeB)
+      return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
+    return B.comesBefore(&A);
+  });

-  auto NoCallIntrinsic = [this](const Instruction *I) {
-    const auto *II = dyn_cast<IntrinsicInst>(I);
-    if (!II)
-      return false;
-    if (II->isAssumeLikeIntrinsic())
-      return true;
-    IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
-    InstructionCost IntrCost =
-        TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
-    InstructionCost CallCost = TTI->getCallInstrCost(
-        nullptr, II->getType(), ICA.getArgTypes(), TTI::TCK_RecipThroughput);
-    return IntrCost < CallCost;
-  };
+  for (const TreeEntry *TE : OrderedEntries) {
+    if (!Prev) {
+      Prev = TE;
+      continue;
+    }

-  // Maps last instruction in the entry to the last instruction for the one of
-  // operand entries and the flag. If the flag is true, there are no calls in
-  // between these instructions.
-  SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
-      CheckedInstructions;
-  unsigned Budget = 0;
-  const unsigned BudgetLimit =
-      ScheduleRegionSizeBudget / VectorizableTree.size();
-  auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
-                                            const Instruction *Last) {
-    assert(First->getParent() == Last->getParent() &&
-           "Expected instructions in same block.");
-    if (auto It = CheckedInstructions.find(Last);
-        It != CheckedInstructions.end()) {
-      const Instruction *Checked = It->second.getPointer();
-      if (Checked == First || Checked->comesBefore(First))
-        return It->second.getInt() != 0;
-      Last = Checked;
-    } else if (Last == First || Last->comesBefore(First)) {
-      return true;
+    LiveEntries.erase(Prev);
+    for (unsigned I : seq<unsigned>(Prev->getNumOperands())) {
+      const TreeEntry *Op = getVectorizedOperand(Prev, I);
+      if (!Op)
+        continue;
+      assert(!Op->isGather() && "Expected vectorized operand.");
+      LiveEntries.insert(Op);
    }
-    BasicBlock::const_reverse_iterator InstIt =
-                                           ++First->getIterator().getReverse(),
-                                       PrevInstIt =
-                                           Last->getIterator().getReverse();
-    SmallVector<const Instruction *> LastInstsInRange;
-    while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
+
+    LLVM_DEBUG({
+      dbgs() << "SLP: #LV: " << LiveEntries.size();
+      for (auto *X : LiveEntries)
+        X->dump();
+      dbgs() << ", Looking at ";
+      TE->dump();
+    });
+
+    // Now find the sequence of instructions between PrevInst and Inst.
+    unsigned NumCalls = 0;
+    const Instruction *PrevInst = &getLastInstructionInBundle(Prev);
+    BasicBlock::const_reverse_iterator
+        InstIt = ++getLastInstructionInBundle(TE).getIterator().getReverse(),
+        PrevInstIt = PrevInst->getIterator().getReverse();
+    while (InstIt != PrevInstIt) {
+      if (PrevInstIt == PrevInst->getParent()->rend()) {
+        PrevInstIt = getLastInstructionInBundle(TE).getParent()->rbegin();
+        continue;
+      }
+
+      auto NoCallIntrinsic = [this](const Instruction *I) {
+        const auto *II = dyn_cast<IntrinsicInst>(I);
+        if (!II)
+          return false;
+        if (II->isAssumeLikeIntrinsic())
+          return true;
+        FastMathFlags FMF;
+        SmallVector<Type *, 4> Tys;
+        for (auto &ArgOp : II->args())
+          Tys.push_back(ArgOp->getType());
+        if (auto *FPMO = dyn_cast<FPMathOperator>(II))
+          FMF = FPMO->getFastMathFlags();
+        IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
+                                    FMF);
+        InstructionCost IntrCost =
+            TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
+        InstructionCost CallCost = TTI->getCallInstrCost(
+            nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
+        return IntrCost < CallCost;
+      };
+
      // Debug information does not impact spill cost.
      // Vectorized calls, represented as vector intrinsics, do not impact spill
      // cost.
      if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
-          CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
-        for (const Instruction *LastInst : LastInstsInRange)
-          CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
-        return false;
-      }
-      if (LastInstructions.contains(&*PrevInstIt))
-        LastInstsInRange.push_back(&*PrevInstIt);
+          CB && !NoCallIntrinsic(CB) && !isVectorized(CB))
+        NumCalls++;

      ++PrevInstIt;
      ++Budget;
    }
-    for (const Instruction *LastInst : LastInstsInRange)
-      CheckedInstructions.try_emplace(
-          LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
-          Budget <= BudgetLimit ? 1 : 0);
-    return Budget <= BudgetLimit;
-  };
-  auto AddCosts = [&](const TreeEntry *Op) {
-    Type *ScalarTy = Op->Scalars.front()->getType();
-    auto It = MinBWs.find(Op);
-    if (It != MinBWs.end())
-      ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
-    auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
-    Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
-    if (ScalarTy->isVectorTy()) {
-      // Handle revec dead vector instructions.
-      Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
-    }
-  };
-  // Memoize the relationship between blocks, i.e. if there is (at least one)
-  // non-vectorized call between the blocks. This allows to skip the analysis of
-  // the same block paths multiple times.
-  SmallDenseMap<std::pair<const BasicBlock *, const BasicBlock *>, bool>
-      ParentOpParentToPreds;
-  auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
-                               BasicBlock *OpParent) {
-    auto Key = std::make_pair(Root, OpParent);
-    if (auto It = ParentOpParentToPreds.find(Key);
-        It != ParentOpParentToPreds.end())
-      return It->second;
-    SmallVector<BasicBlock *> Worklist;
-    if (Pred)
-      Worklist.push_back(Pred);
-    else
-      Worklist.append(pred_begin(Root), pred_end(Root));
-    SmallPtrSet<const BasicBlock *, 16> Visited;
-    SmallDenseSet<std::pair<const BasicBlock *, const BasicBlock *>>
-        ParentsPairsToAdd;
-    bool Res = false;
-    auto Cleanup = make_scope_exit([&]() {
-      for (const auto &KeyPair : ParentsPairsToAdd) {
-        assert(!ParentOpParentToPreds.contains(KeyPair) &&
-               "Should not have been added before.");
-        ParentOpParentToPreds.try_emplace(KeyPair, Res);
-      }
-    });
-    while (!Worklist.empty()) {
-      BasicBlock *BB = Worklist.pop_back_val();
-      if (BB == OpParent || !Visited.insert(BB).second)
-        continue;
-      auto Pair = std::make_pair(BB, OpParent);
-      if (auto It = ParentOpParentToPreds.find(Pair);
-          It != ParentOpParentToPreds.end()) {
-        Res = It->second;
-        return Res;
-      }
-      ParentsPairsToAdd.insert(Pair);
-      unsigned BlockSize = BB->size();
-      if (BlockSize > static_cast<unsigned>(ScheduleRegionSizeBudget))
-        return Res;
-      Budget += BlockSize;
-      if (Budget > BudgetLimit)
-        return Res;
-      if (!isa<CatchSwitchInst>(BB->getTerminator()) &&
-          !CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
-                                          BB->getTerminator()))
-        return Res;
-      Worklist.append(pred_begin(BB), pred_end(BB));
-    }
-    Res = true;
-    return Res;
-  };
-  SmallVector<const TreeEntry *> LiveEntries(1, Root);
-  while (!LiveEntries.empty()) {
-    const TreeEntry *Entry = LiveEntries.pop_back_val();
-    SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
-    if (Operands.empty())
-      continue;
-    Instruction *LastInst = EntriesToLastInstruction.at(Entry);
-    BasicBlock *Parent = LastInst->getParent();
-    for (const TreeEntry *Op : Operands) {
-      if (!Op->isGather())
-        LiveEntries.push_back(Op);
-      if ((Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
-          (Op->isGather() && allConstant(Op->Scalars)))
-        continue;
-      Budget = 0;
-      BasicBlock *Pred = nullptr;
-      if (auto *Phi = dyn_cast<PHINode>(Entry->getMainOp()))
-        Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
-      BasicBlock *OpParent;
-      Instruction *OpLastInst;
-      if (Op->isGather()) {
-        assert(Entry->getOpcode() == Instruction::PHI &&
-               "Expected phi node only.");
-        OpParent = cast<PHINode>(Entry->getMainOp())
-                       ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
-        OpLastInst = OpParent->getTerminator();
-        for (Value *V : Op->Scalars) {
-          auto *Inst = dyn_cast<Instruction>(V);
-          if (!Inst)
-            continue;
-          if (isVectorized(V)) {
-            OpParent = Inst->getParent();
-            OpLastInst = Inst;
-            break;
-          }
-        }
-      } else {
-        OpLastInst = EntriesToLastInstruction.at(Op);
-        OpParent = OpLastInst->getParent();
-      }
-      // Check the call instructions within the same basic blocks.
-      if (OpParent == Parent) {
-        if (Entry->getOpcode() == Instruction::PHI) {
-          if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
-            AddCosts(Op);
-          continue;
-        }
-        if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
-          AddCosts(Op);
-        continue;
-      }
-      // Check for call instruction in between blocks.
-      // 1. Check entry's block to the head.
-      if (Entry->getOpcode() != Instruction::PHI &&
-          !CheckForNonVecCallsInSameBlock(
-              &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
-              LastInst)) {
-        AddCosts(Op);
-        continue;
-      }
-      // 2. Check op's block from the end.
-      if (!CheckForNonVecCallsInSameBlock(OpLastInst,
-                                          OpParent->getTerminator())) {
-        AddCosts(Op);
-        continue;
-      }
-      // 3. Check the predecessors of entry's block till op's block.
-      if (!CheckPredecessors(Parent, Pred, OpParent)) {
-        AddCosts(Op);
-        continue;
+
+    if (NumCalls) {
+      SmallVector<Type *, 4> EntriesTypes;
+      for (const TreeEntry *TE : LiveEntries) {
+        auto *ScalarTy = TE->getMainOp()->getType();
+        auto It = MinBWs.find(TE);
+        if (It != MinBWs.end())
+          ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
+        EntriesTypes.push_back(getWidenedType(ScalarTy, TE->getVectorFactor()));
      }
+      Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(EntriesTypes);
    }
+
+    Prev = TE;
  }

  return Cost;
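(Editorial summary, not part of the diff.) The restored walk visits non-gather entries in dominance order, keeps a running set of entries that are still live, counts the non-vectorized calls between the last instructions of consecutive entries, and charges each such call with the cost of keeping the live entries' widened types in registers. A simplified, self-contained model of that accounting, with hypothetical types and a made-up per-call cost standing in for `TTI::getCostOfKeepingLiveOverCall`:

```cpp
#include <cstdio>
#include <set>
#include <string>
#include <vector>

// Hypothetical stand-in for a vectorizable bundle: the position of its last
// instruction, the widened vector type it produces, and the indices of its
// vectorized operand bundles.
struct Entry {
  int LastInstPos;
  std::string WidenedTy;
  std::vector<int> Operands;
};

// Made-up cost: pretend one spill/fill pair per live type.
static int costOfKeepingLiveOverCall(const std::vector<std::string> &Tys) {
  return static_cast<int>(Tys.size());
}

// Entries must already be sorted so bundles whose last instruction comes later
// are visited first, mirroring the dominance-sorted OrderedEntries above.
static int spillCost(const std::vector<Entry> &Ordered,
                     const std::set<int> &NonVecCallPositions) {
  int Cost = 0;
  std::set<int> Live; // indices of entries currently live
  const Entry *Prev = nullptr;
  int PrevIdx = -1;
  for (int Idx = 0; Idx != static_cast<int>(Ordered.size()); ++Idx) {
    const Entry &TE = Ordered[Idx];
    if (!Prev) {
      Prev = &TE;
      PrevIdx = Idx;
      continue;
    }
    // Prev has been emitted; what stays live are its vectorized operands.
    Live.erase(PrevIdx);
    for (int Op : Prev->Operands)
      Live.insert(Op);

    // Count non-vectorized calls between the two bundle end points
    // (a simplification of the reverse instruction walk in the patch).
    int NumCalls = 0;
    for (int Pos : NonVecCallPositions)
      if (Pos > TE.LastInstPos && Pos < Prev->LastInstPos)
        ++NumCalls;

    if (NumCalls) {
      std::vector<std::string> LiveTys;
      for (int L : Live)
        LiveTys.push_back(Ordered[L].WidenedTy);
      Cost += NumCalls * costOfKeepingLiveOverCall(LiveTys);
    }

    Prev = &TE;
    PrevIdx = Idx;
  }
  return Cost;
}

int main() {
  // The bundle ending at position 30 uses the bundle defined at position 10;
  // a non-vectorized call sits at position 20, so the operand's <4 x float>
  // value is charged as live over one call.
  std::vector<Entry> Ordered = {{30, "<4 x float>", {1}},
                                {10, "<4 x float>", {}}};
  std::set<int> Calls = {20};
  std::printf("spill cost = %d\n", spillCost(Ordered, Calls));
}
```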