@@ -12165,112 +12165,224 @@ InstructionCost BoUpSLP::getSpillCost() const {
12165
12165
// live. When we see a call instruction that is not part of our tree,
12166
12166
// query TTI to see if there is a cost to keeping values live over it
12167
12167
// (for example, if spills and fills are required).
12168
- unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
12169
- InstructionCost Cost = 0;
12170
12168
12171
- SmallPtrSet<Instruction *, 4> LiveValues;
12172
- Instruction *PrevInst = nullptr;
12169
+ const TreeEntry *Root = VectorizableTree.front().get();
12170
+ if (Root->isGather())
12171
+ return 0;
12173
12172
12174
- // The entries in VectorizableTree are not necessarily ordered by their
12175
- // position in basic blocks. Collect them and order them by dominance so later
12176
- // instructions are guaranteed to be visited first. For instructions in
12177
- // different basic blocks, we only scan to the beginning of the block, so
12178
- // their order does not matter, as long as all instructions in a basic block
12179
- // are grouped together. Using dominance ensures a deterministic order.
12180
- SmallVector<Instruction *, 16> OrderedScalars;
12173
+ InstructionCost Cost = 0;
12174
+ SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
12175
+ EntriesToOperands;
12176
+ SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
12177
+ SmallPtrSet<const Instruction *, 8> LastInstructions;
12181
12178
for (const auto &TEPtr : VectorizableTree) {
12182
- if (TEPtr->State != TreeEntry::Vectorize)
12183
- continue;
12184
- Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
12185
- if (!Inst)
12186
- continue;
12187
- OrderedScalars.push_back(Inst);
12188
- }
12189
- llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
12190
- auto *NodeA = DT->getNode(A->getParent());
12191
- auto *NodeB = DT->getNode(B->getParent());
12192
- assert(NodeA && "Should only process reachable instructions");
12193
- assert(NodeB && "Should only process reachable instructions");
12194
- assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
12195
- "Different nodes should have different DFS numbers");
12196
- if (NodeA != NodeB)
12197
- return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
12198
- return B->comesBefore(A);
12199
- });
12200
-
12201
- for (Instruction *Inst : OrderedScalars) {
12202
- if (!PrevInst) {
12203
- PrevInst = Inst;
12204
- continue;
12179
+ if (!TEPtr->isGather()) {
12180
+ Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
12181
+ EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
12182
+ LastInstructions.insert(LastInst);
12205
12183
}
12184
+ if (TEPtr->UserTreeIndex)
12185
+ EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
12186
+ }
12206
12187
12207
- // Update LiveValues.
12208
- LiveValues.erase(PrevInst);
12209
- for (auto &J : PrevInst->operands()) {
12210
- if (isa<Instruction>(&*J) && isVectorized(&*J))
12211
- LiveValues.insert(cast<Instruction>(&*J));
12188
+ auto NoCallIntrinsic = [this](const Instruction *I) {
12189
+ const auto *II = dyn_cast<IntrinsicInst>(I);
12190
+ if (!II)
12191
+ return false;
12192
+ if (II->isAssumeLikeIntrinsic())
12193
+ return true;
12194
+ IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
12195
+ InstructionCost IntrCost =
12196
+ TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
12197
+ InstructionCost CallCost = TTI->getCallInstrCost(
12198
+ nullptr, II->getType(), ICA.getArgTypes(), TTI::TCK_RecipThroughput);
12199
+ return IntrCost < CallCost;
12200
+ };
12201
+
12202
+ // Maps the last instruction of an entry to the last instruction of one of its
12203
+ // operand entries and the flag. If the flag is true, there are no calls in
12204
+ // between these instructions.
12205
+ SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
12206
+ CheckedInstructions;
12207
+ unsigned Budget = 0;
12208
+ const unsigned BudgetLimit =
12209
+ ScheduleRegionSizeBudget / VectorizableTree.size();
12210
+ auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
12211
+ const Instruction *Last) {
12212
+ assert(First->getParent() == Last->getParent() &&
12213
+ "Expected instructions in same block.");
12214
+ if (auto It = CheckedInstructions.find(Last);
12215
+ It != CheckedInstructions.end()) {
12216
+ const Instruction *Checked = It->second.getPointer();
12217
+ if (Checked == First || Checked->comesBefore(First))
12218
+ return It->second.getInt() != 0;
12219
+ Last = Checked;
12220
+ } else if (Last == First || Last->comesBefore(First)) {
12221
+ return true;
12212
12222
}
12223
+ BasicBlock::const_reverse_iterator InstIt =
12224
+ ++First->getIterator().getReverse(),
12225
+ PrevInstIt =
12226
+ Last->getIterator().getReverse();
12227
+ SmallVector<const Instruction *> LastInstsInRange;
12228
+ while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
12229
+ // Debug information does not impact spill cost.
12230
+ // Vectorized calls, represented as vector intrinsics, do not impact spill
12231
+ // cost.
12232
+ if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
12233
+ CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
12234
+ for (const Instruction *LastInst : LastInstsInRange)
12235
+ CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
12236
+ return false;
12237
+ }
12238
+ if (LastInstructions.contains(&*PrevInstIt))
12239
+ LastInstsInRange.push_back(&*PrevInstIt);
12213
12240
12214
- LLVM_DEBUG({
12215
- dbgs() << "SLP: #LV: " << LiveValues.size();
12216
- for (auto *X : LiveValues)
12217
- dbgs() << " " << X->getName();
12218
- dbgs() << ", Looking at ";
12219
- Inst->dump();
12241
+ ++PrevInstIt;
12242
+ ++Budget;
12243
+ }
12244
+ for (const Instruction *LastInst : LastInstsInRange)
12245
+ CheckedInstructions.try_emplace(
12246
+ LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
12247
+ Budget <= BudgetLimit ? 1 : 0);
12248
+ return Budget <= BudgetLimit;
12249
+ };
12250
+ auto AddCosts = [&](const TreeEntry *Op) {
12251
+ Type *ScalarTy = Op->Scalars.front()->getType();
12252
+ auto It = MinBWs.find(Op);
12253
+ if (It != MinBWs.end())
12254
+ ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
12255
+ auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
12256
+ Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
12257
+ if (ScalarTy->isVectorTy()) {
12258
+ // Handle revec dead vector instructions.
12259
+ Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
12260
+ }
12261
+ };
12262
+ // Memoize the relationship between blocks, i.e. if there is (at least one)
12263
+ // non-vectorized call between the blocks. This allows skipping the analysis of
12264
+ // the same block paths multiple times.
12265
+ SmallDenseMap<std::pair<const BasicBlock *, const BasicBlock *>, bool>
12266
+ ParentOpParentToPreds;
12267
+ auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
12268
+ BasicBlock *OpParent) {
12269
+ auto Key = std::make_pair(Root, OpParent);
12270
+ if (auto It = ParentOpParentToPreds.find(Key);
12271
+ It != ParentOpParentToPreds.end())
12272
+ return It->second;
12273
+ SmallVector<BasicBlock *> Worklist;
12274
+ if (Pred)
12275
+ Worklist.push_back(Pred);
12276
+ else
12277
+ Worklist.append(pred_begin(Root), pred_end(Root));
12278
+ SmallPtrSet<const BasicBlock *, 16> Visited;
12279
+ SmallDenseSet<std::pair<const BasicBlock *, const BasicBlock *>>
12280
+ ParentsPairsToAdd;
12281
+ bool Res = false;
12282
+ auto Cleanup = make_scope_exit([&]() {
12283
+ for (const auto &KeyPair : ParentsPairsToAdd) {
12284
+ assert(!ParentOpParentToPreds.contains(KeyPair) &&
12285
+ "Should not have been added before.");
12286
+ ParentOpParentToPreds.try_emplace(KeyPair, Res);
12287
+ }
12220
12288
});
12221
-
12222
- // Now find the sequence of instructions between PrevInst and Inst.
12223
- unsigned NumCalls = 0;
12224
- BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
12225
- PrevInstIt =
12226
- PrevInst->getIterator().getReverse();
12227
- while (InstIt != PrevInstIt) {
12228
- if (PrevInstIt == PrevInst->getParent()->rend()) {
12229
- PrevInstIt = Inst->getParent()->rbegin();
12289
+ while (!Worklist.empty()) {
12290
+ BasicBlock *BB = Worklist.pop_back_val();
12291
+ if (BB == OpParent || !Visited.insert(BB).second)
12230
12292
continue;
12293
+ auto Pair = std::make_pair(BB, OpParent);
12294
+ if (auto It = ParentOpParentToPreds.find(Pair);
12295
+ It != ParentOpParentToPreds.end()) {
12296
+ Res = It->second;
12297
+ return Res;
12231
12298
}
12232
-
12233
- auto NoCallIntrinsic = [this](Instruction *I) {
12234
- auto *II = dyn_cast<IntrinsicInst>(I);
12235
- if (!II)
12236
- return false;
12237
- if (II->isAssumeLikeIntrinsic())
12238
- return true;
12239
- FastMathFlags FMF;
12240
- SmallVector<Type *, 4> Tys;
12241
- for (auto &ArgOp : II->args())
12242
- Tys.push_back(ArgOp->getType());
12243
- if (auto *FPMO = dyn_cast<FPMathOperator>(II))
12244
- FMF = FPMO->getFastMathFlags();
12245
- IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
12246
- FMF);
12247
- InstructionCost IntrCost =
12248
- TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
12249
- InstructionCost CallCost = TTI->getCallInstrCost(
12250
- nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
12251
- return IntrCost < CallCost;
12252
- };
12253
-
12254
- // Debug information does not impact spill cost.
12255
- if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
12256
- &*PrevInstIt != PrevInst)
12257
- NumCalls++;
12258
-
12259
- ++PrevInstIt;
12299
+ ParentsPairsToAdd.insert(Pair);
12300
+ unsigned BlockSize = BB->size();
12301
+ if (BlockSize > static_cast<unsigned>(ScheduleRegionSizeBudget))
12302
+ return Res;
12303
+ Budget += BlockSize;
12304
+ if (Budget > BudgetLimit)
12305
+ return Res;
12306
+ if (!CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
12307
+ BB->getTerminator()))
12308
+ return Res;
12309
+ Worklist.append(pred_begin(BB), pred_end(BB));
12260
12310
}
12261
-
12262
- if (NumCalls) {
12263
- SmallVector<Type *, 4> V;
12264
- for (auto *II : LiveValues) {
12265
- auto *ScalarTy = II->getType();
12266
- if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
12267
- ScalarTy = VectorTy->getElementType();
12268
- V.push_back(getWidenedType(ScalarTy, BundleWidth));
12311
+ Res = true;
12312
+ return Res;
12313
+ };
12314
+ SmallVector<const TreeEntry *> LiveEntries(1, Root);
12315
+ while (!LiveEntries.empty()) {
12316
+ const TreeEntry *Entry = LiveEntries.pop_back_val();
12317
+ SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
12318
+ if (Operands.empty())
12319
+ continue;
12320
+ Instruction *LastInst = EntriesToLastInstruction.at(Entry);
12321
+ BasicBlock *Parent = LastInst->getParent();
12322
+ for (const TreeEntry *Op : Operands) {
12323
+ if (!Op->isGather())
12324
+ LiveEntries.push_back(Op);
12325
+ if ((Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
12326
+ (Op->isGather() && allConstant(Op->Scalars)))
12327
+ continue;
12328
+ Budget = 0;
12329
+ BasicBlock *Pred = nullptr;
12330
+ if (auto *Phi = dyn_cast<PHINode>(Entry->getMainOp()))
12331
+ Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
12332
+ BasicBlock *OpParent;
12333
+ Instruction *OpLastInst;
12334
+ if (Op->isGather()) {
12335
+ assert(Entry->getOpcode() == Instruction::PHI &&
12336
+ "Expected phi node only.");
12337
+ OpParent = cast<PHINode>(Entry->getMainOp())
12338
+ ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
12339
+ OpLastInst = OpParent->getTerminator();
12340
+ for (Value *V : Op->Scalars) {
12341
+ auto *Inst = dyn_cast<Instruction>(V);
12342
+ if (!Inst)
12343
+ continue;
12344
+ if (isVectorized(V)) {
12345
+ OpParent = Inst->getParent();
12346
+ OpLastInst = Inst;
12347
+ break;
12348
+ }
12349
+ }
12350
+ } else {
12351
+ OpLastInst = EntriesToLastInstruction.at(Op);
12352
+ OpParent = OpLastInst->getParent();
12353
+ }
12354
+ // Check the call instructions within the same basic blocks.
12355
+ if (OpParent == Parent) {
12356
+ if (Entry->getOpcode() == Instruction::PHI) {
12357
+ if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
12358
+ AddCosts(Op);
12359
+ continue;
12360
+ }
12361
+ if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
12362
+ AddCosts(Op);
12363
+ continue;
12364
+ }
12365
+ // Check for call instructions in between blocks.
12366
+ // 1. Check entry's block to the head.
12367
+ if (Entry->getOpcode() != Instruction::PHI &&
12368
+ !CheckForNonVecCallsInSameBlock(
12369
+ &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
12370
+ LastInst)) {
12371
+ AddCosts(Op);
12372
+ continue;
12373
+ }
12374
+ // 2. Check op's block from the end.
12375
+ if (!CheckForNonVecCallsInSameBlock(OpLastInst,
12376
+ OpParent->getTerminator())) {
12377
+ AddCosts(Op);
12378
+ continue;
12379
+ }
12380
+ // 3. Check the predecessors of entry's block till op's block.
12381
+ if (!CheckPredecessors(Parent, Pred, OpParent)) {
12382
+ AddCosts(Op);
12383
+ continue;
12269
12384
}
12270
- Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
12271
12385
}
12272
-
12273
- PrevInst = Inst;
12274
12386
}
12275
12387
12276
12388
return Cost;
@@ -12778,8 +12890,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
12778
12890
}
12779
12891
}
12780
12892
12781
- InstructionCost SpillCost = getSpillCost();
12782
- Cost += SpillCost + ExtractCost;
12893
+ Cost += ExtractCost;
12783
12894
auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
12784
12895
bool) {
12785
12896
InstructionCost C = 0;
@@ -12918,12 +13029,21 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
12918
13029
}
12919
13030
}
12920
13031
13032
+ std::optional<InstructionCost> SpillCost;
13033
+ if (Cost < -SLPCostThreshold) {
13034
+ SpillCost = getSpillCost();
13035
+ Cost += *SpillCost;
13036
+ }
12921
13037
#ifndef NDEBUG
12922
13038
SmallString<256> Str;
12923
13039
{
12924
13040
raw_svector_ostream OS(Str);
12925
- OS << "SLP: Spill Cost = " << SpillCost << ".\n"
12926
- << "SLP: Extract Cost = " << ExtractCost << ".\n"
13041
+ OS << "SLP: Spill Cost = ";
13042
+ if (SpillCost)
13043
+ OS << *SpillCost;
13044
+ else
13045
+ OS << "<skipped>";
13046
+ OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
12927
13047
<< "SLP: Total Cost = " << Cost << ".\n";
12928
13048
}
12929
13049
LLVM_DEBUG(dbgs() << Str);
0 commit comments