@@ -12449,111 +12449,224 @@ InstructionCost BoUpSLP::getSpillCost() {
12449
12449
// live. When we see a call instruction that is not part of our tree,
12450
12450
// query TTI to see if there is a cost to keeping values live over it
12451
12451
// (for example, if spills and fills are required).
12452
- InstructionCost Cost = 0;
12453
12452
12454
- SmallPtrSet<const TreeEntry *, 4> LiveEntries;
12455
- const TreeEntry *Prev = nullptr;
12453
+ const TreeEntry *Root = VectorizableTree.front().get();
12454
+ if (Root->isGather())
12455
+ return 0;
12456
12456
12457
- // The entries in VectorizableTree are not necessarily ordered by their
12458
- // position in basic blocks. Collect them and order them by dominance so later
12459
- // instructions are guaranteed to be visited first. For instructions in
12460
- // different basic blocks, we only scan to the beginning of the block, so
12461
- // their order does not matter, as long as all instructions in a basic block
12462
- // are grouped together. Using dominance ensures a deterministic order.
12463
- SmallVector<TreeEntry *, 16> OrderedEntries;
12457
+ InstructionCost Cost = 0;
12458
+ SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
12459
+ EntriesToOperands;
12460
+ SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
12461
+ SmallPtrSet<const Instruction *, 8> LastInstructions;
12464
12462
for (const auto &TEPtr : VectorizableTree) {
12465
- if (TEPtr->isGather())
12466
- continue;
12467
- OrderedEntries.push_back(TEPtr.get());
12468
- }
12469
- llvm::stable_sort(OrderedEntries, [&](const TreeEntry *TA,
12470
- const TreeEntry *TB) {
12471
- Instruction &A = getLastInstructionInBundle(TA);
12472
- Instruction &B = getLastInstructionInBundle(TB);
12473
- auto *NodeA = DT->getNode(A.getParent());
12474
- auto *NodeB = DT->getNode(B.getParent());
12475
- assert(NodeA && "Should only process reachable instructions");
12476
- assert(NodeB && "Should only process reachable instructions");
12477
- assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
12478
- "Different nodes should have different DFS numbers");
12479
- if (NodeA != NodeB)
12480
- return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
12481
- return B.comesBefore(&A);
12482
- });
12483
-
12484
- for (const TreeEntry *TE : OrderedEntries) {
12485
- if (!Prev) {
12486
- Prev = TE;
12487
- continue;
12488
- }
12489
-
12490
- LiveEntries.erase(Prev);
12491
- for (unsigned I : seq<unsigned>(Prev->getNumOperands())) {
12492
- const TreeEntry *Op = getVectorizedOperand(Prev, I);
12493
- if (!Op)
12494
- continue;
12495
- assert(!Op->isGather() && "Expected vectorized operand.");
12496
- LiveEntries.insert(Op);
12463
+ if (!TEPtr->isGather()) {
12464
+ Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
12465
+ EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
12466
+ LastInstructions.insert(LastInst);
12497
12467
}
12468
+ if (TEPtr->UserTreeIndex)
12469
+ EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
12470
+ }
12498
12471
12499
- LLVM_DEBUG({
12500
- dbgs() << "SLP: #LV: " << LiveEntries.size();
12501
- for (auto *X : LiveEntries)
12502
- X->dump();
12503
- dbgs() << ", Looking at ";
12504
- TE->dump();
12505
- });
12506
-
12507
- // Now find the sequence of instructions between PrevInst and Inst.
12508
- unsigned NumCalls = 0;
12509
- const Instruction *PrevInst = &getLastInstructionInBundle(Prev);
12510
- BasicBlock::const_reverse_iterator
12511
- InstIt = ++getLastInstructionInBundle(TE).getIterator().getReverse(),
12512
- PrevInstIt = PrevInst->getIterator().getReverse();
12513
- while (InstIt != PrevInstIt) {
12514
- if (PrevInstIt == PrevInst->getParent()->rend()) {
12515
- PrevInstIt = getLastInstructionInBundle(TE).getParent()->rbegin();
12516
- continue;
12517
- }
12518
-
12519
- auto NoCallIntrinsic = [this](const Instruction *I) {
12520
- const auto *II = dyn_cast<IntrinsicInst>(I);
12521
- if (!II)
12522
- return false;
12523
- if (II->isAssumeLikeIntrinsic())
12524
- return true;
12525
- IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
12526
- InstructionCost IntrCost =
12527
- TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
12528
- InstructionCost CallCost =
12529
- TTI->getCallInstrCost(nullptr, II->getType(), ICA.getArgTypes(),
12530
- TTI::TCK_RecipThroughput);
12531
- return IntrCost < CallCost;
12532
- };
12472
+ auto NoCallIntrinsic = [this](const Instruction *I) {
12473
+ const auto *II = dyn_cast<IntrinsicInst>(I);
12474
+ if (!II)
12475
+ return false;
12476
+ if (II->isAssumeLikeIntrinsic())
12477
+ return true;
12478
+ IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
12479
+ InstructionCost IntrCost =
12480
+ TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
12481
+ InstructionCost CallCost = TTI->getCallInstrCost(
12482
+ nullptr, II->getType(), ICA.getArgTypes(), TTI::TCK_RecipThroughput);
12483
+ return IntrCost < CallCost;
12484
+ };
12533
12485
12486
+ // Maps the last instruction of an entry to the last instruction of one of
12487
+ // its operand entries and a flag. If the flag is true, there are no calls in
12488
+ // between these instructions.
12489
+ SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
12490
+ CheckedInstructions;
12491
+ unsigned Budget = 0;
12492
+ const unsigned BudgetLimit =
12493
+ ScheduleRegionSizeBudget / VectorizableTree.size();
12494
+ auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
12495
+ const Instruction *Last) {
12496
+ assert(First->getParent() == Last->getParent() &&
12497
+ "Expected instructions in same block.");
12498
+ if (auto It = CheckedInstructions.find(Last);
12499
+ It != CheckedInstructions.end()) {
12500
+ const Instruction *Checked = It->second.getPointer();
12501
+ if (Checked == First || Checked->comesBefore(First))
12502
+ return It->second.getInt() != 0;
12503
+ Last = Checked;
12504
+ } else if (Last == First || Last->comesBefore(First)) {
12505
+ return true;
12506
+ }
12507
+ BasicBlock::const_reverse_iterator InstIt =
12508
+ ++First->getIterator().getReverse(),
12509
+ PrevInstIt =
12510
+ Last->getIterator().getReverse();
12511
+ SmallVector<const Instruction *> LastInstsInRange;
12512
+ while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
12534
12513
// Debug information does not impact spill cost.
12535
12514
// Vectorized calls, represented as vector intrinsics, do not impact spill
12536
12515
// cost.
12537
12516
if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
12538
- CB && !NoCallIntrinsic(CB) && !isVectorized(CB))
12539
- NumCalls++;
12517
+ CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
12518
+ for (const Instruction *LastInst : LastInstsInRange)
12519
+ CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
12520
+ return false;
12521
+ }
12522
+ if (LastInstructions.contains(&*PrevInstIt))
12523
+ LastInstsInRange.push_back(&*PrevInstIt);
12540
12524
12541
12525
++PrevInstIt;
12526
+ ++Budget;
12542
12527
}
12543
-
12544
- if (NumCalls) {
12545
- SmallVector<Type *, 4> EntriesTypes;
12546
- for (const TreeEntry *TE : LiveEntries) {
12547
- auto *ScalarTy = TE->getMainOp()->getType();
12548
- auto It = MinBWs.find(TE);
12549
- if (It != MinBWs.end())
12550
- ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
12551
- EntriesTypes.push_back(getWidenedType(ScalarTy, TE->getVectorFactor()));
12528
+ for (const Instruction *LastInst : LastInstsInRange)
12529
+ CheckedInstructions.try_emplace(
12530
+ LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
12531
+ Budget <= BudgetLimit ? 1 : 0);
12532
+ return Budget <= BudgetLimit;
12533
+ };
12534
+ auto AddCosts = [&](const TreeEntry *Op) {
12535
+ Type *ScalarTy = Op->Scalars.front()->getType();
12536
+ auto It = MinBWs.find(Op);
12537
+ if (It != MinBWs.end())
12538
+ ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
12539
+ auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
12540
+ Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
12541
+ if (ScalarTy->isVectorTy()) {
12542
+ // Handle revec dead vector instructions.
12543
+ Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
12544
+ }
12545
+ };
12546
+ // Memoize the relationship between blocks, i.e. whether there is (at least
12547
+ // one) non-vectorized call between the blocks. This allows skipping repeated
12548
+ // analysis of the same block paths.
12549
+ SmallDenseMap<std::pair<const BasicBlock *, const BasicBlock *>, bool>
12550
+ ParentOpParentToPreds;
12551
+ auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
12552
+ BasicBlock *OpParent) {
12553
+ auto Key = std::make_pair(Root, OpParent);
12554
+ if (auto It = ParentOpParentToPreds.find(Key);
12555
+ It != ParentOpParentToPreds.end())
12556
+ return It->second;
12557
+ SmallVector<BasicBlock *> Worklist;
12558
+ if (Pred)
12559
+ Worklist.push_back(Pred);
12560
+ else
12561
+ Worklist.append(pred_begin(Root), pred_end(Root));
12562
+ SmallPtrSet<const BasicBlock *, 16> Visited;
12563
+ SmallDenseSet<std::pair<const BasicBlock *, const BasicBlock *>>
12564
+ ParentsPairsToAdd;
12565
+ bool Res = false;
12566
+ auto Cleanup = make_scope_exit([&]() {
12567
+ for (const auto &KeyPair : ParentsPairsToAdd) {
12568
+ assert(!ParentOpParentToPreds.contains(KeyPair) &&
12569
+ "Should not have been added before.");
12570
+ ParentOpParentToPreds.try_emplace(KeyPair, Res);
12571
+ }
12572
+ });
12573
+ while (!Worklist.empty()) {
12574
+ BasicBlock *BB = Worklist.pop_back_val();
12575
+ if (BB == OpParent || !Visited.insert(BB).second)
12576
+ continue;
12577
+ auto Pair = std::make_pair(BB, OpParent);
12578
+ if (auto It = ParentOpParentToPreds.find(Pair);
12579
+ It != ParentOpParentToPreds.end()) {
12580
+ Res = It->second;
12581
+ return Res;
12582
+ }
12583
+ ParentsPairsToAdd.insert(Pair);
12584
+ unsigned BlockSize = BB->size();
12585
+ if (BlockSize > static_cast<unsigned>(ScheduleRegionSizeBudget))
12586
+ return Res;
12587
+ Budget += BlockSize;
12588
+ if (Budget > BudgetLimit)
12589
+ return Res;
12590
+ if (!CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
12591
+ BB->getTerminator()))
12592
+ return Res;
12593
+ Worklist.append(pred_begin(BB), pred_end(BB));
12594
+ }
12595
+ Res = true;
12596
+ return Res;
12597
+ };
12598
+ SmallVector<const TreeEntry *> LiveEntries(1, Root);
12599
+ while (!LiveEntries.empty()) {
12600
+ const TreeEntry *Entry = LiveEntries.pop_back_val();
12601
+ SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
12602
+ if (Operands.empty())
12603
+ continue;
12604
+ Instruction *LastInst = EntriesToLastInstruction.at(Entry);
12605
+ BasicBlock *Parent = LastInst->getParent();
12606
+ for (const TreeEntry *Op : Operands) {
12607
+ if (!Op->isGather())
12608
+ LiveEntries.push_back(Op);
12609
+ if ((Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
12610
+ (Op->isGather() && allConstant(Op->Scalars)))
12611
+ continue;
12612
+ Budget = 0;
12613
+ BasicBlock *Pred = nullptr;
12614
+ if (auto *Phi = dyn_cast<PHINode>(Entry->getMainOp()))
12615
+ Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
12616
+ BasicBlock *OpParent;
12617
+ Instruction *OpLastInst;
12618
+ if (Op->isGather()) {
12619
+ assert(Entry->getOpcode() == Instruction::PHI &&
12620
+ "Expected phi node only.");
12621
+ OpParent = cast<PHINode>(Entry->getMainOp())
12622
+ ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
12623
+ OpLastInst = OpParent->getTerminator();
12624
+ for (Value *V : Op->Scalars) {
12625
+ auto *Inst = dyn_cast<Instruction>(V);
12626
+ if (!Inst)
12627
+ continue;
12628
+ if (isVectorized(V)) {
12629
+ OpParent = Inst->getParent();
12630
+ OpLastInst = Inst;
12631
+ break;
12632
+ }
12633
+ }
12634
+ } else {
12635
+ OpLastInst = EntriesToLastInstruction.at(Op);
12636
+ OpParent = OpLastInst->getParent();
12637
+ }
12638
+ // Check the call instructions within the same basic blocks.
12639
+ if (OpParent == Parent) {
12640
+ if (Entry->getOpcode() == Instruction::PHI) {
12641
+ if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
12642
+ AddCosts(Op);
12643
+ continue;
12644
+ }
12645
+ if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
12646
+ AddCosts(Op);
12647
+ continue;
12648
+ }
12649
+ // Check for call instruction in between blocks.
12650
+ // 1. Check entry's block to the head.
12651
+ if (Entry->getOpcode() != Instruction::PHI &&
12652
+ !CheckForNonVecCallsInSameBlock(
12653
+ &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
12654
+ LastInst)) {
12655
+ AddCosts(Op);
12656
+ continue;
12657
+ }
12658
+ // 2. Check op's block from the end.
12659
+ if (!CheckForNonVecCallsInSameBlock(OpLastInst,
12660
+ OpParent->getTerminator())) {
12661
+ AddCosts(Op);
12662
+ continue;
12663
+ }
12664
+ // 3. Check the predecessors of entry's block till op's block.
12665
+ if (!CheckPredecessors(Parent, Pred, OpParent)) {
12666
+ AddCosts(Op);
12667
+ continue;
12552
12668
}
12553
- Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(EntriesTypes);
12554
12669
}
12555
-
12556
- Prev = TE;
12557
12670
}
12558
12671
12559
12672
return Cost;
@@ -13061,8 +13174,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
13061
13174
}
13062
13175
}
13063
13176
13064
- InstructionCost SpillCost = getSpillCost();
13065
- Cost += SpillCost + ExtractCost;
13177
+ Cost += ExtractCost;
13066
13178
auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
13067
13179
bool) {
13068
13180
InstructionCost C = 0;
@@ -13201,12 +13313,21 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
13201
13313
}
13202
13314
}
13203
13315
13316
+ std::optional<InstructionCost> SpillCost;
13317
+ if (Cost < -SLPCostThreshold) {
13318
+ SpillCost = getSpillCost();
13319
+ Cost += *SpillCost;
13320
+ }
13204
13321
#ifndef NDEBUG
13205
13322
SmallString<256> Str;
13206
13323
{
13207
13324
raw_svector_ostream OS(Str);
13208
- OS << "SLP: Spill Cost = " << SpillCost << ".\n"
13209
- << "SLP: Extract Cost = " << ExtractCost << ".\n"
13325
+ OS << "SLP: Spill Cost = ";
13326
+ if (SpillCost)
13327
+ OS << *SpillCost;
13328
+ else
13329
+ OS << "<skipped>";
13330
+ OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
13210
13331
<< "SLP: Total Cost = " << Cost << ".\n";
13211
13332
}
13212
13333
LLVM_DEBUG(dbgs() << Str);
0 commit comments