@@ -140,10 +140,6 @@ static cl::opt<unsigned>
140
140
MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
141
141
cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
142
142
143
- static cl::opt<int>
144
- MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden,
145
- cl::desc("Maximum depth of the lookup for consecutive stores."));
146
-
147
143
/// Limits the size of scheduling regions in a block.
148
144
/// It avoid long compile times for _very_ large blocks where vector
149
145
/// instructions are spread over a wide range.
@@ -12439,139 +12435,206 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
12439
12435
BoUpSLP::ValueSet VectorizedStores;
12440
12436
bool Changed = false;
12441
12437
12442
- int E = Stores.size();
12443
- SmallBitVector Tails(E, false);
12444
- int MaxIter = MaxStoreLookup.getValue();
12445
- SmallVector<std::pair<int, int>, 16> ConsecutiveChain(
12446
- E, std::make_pair(E, INT_MAX));
12447
- SmallVector<SmallBitVector, 4> CheckedPairs(E, SmallBitVector(E, false));
12448
- int IterCnt;
12449
- auto &&FindConsecutiveAccess = [this, &Stores, &Tails, &IterCnt, MaxIter,
12450
- &CheckedPairs,
12451
- &ConsecutiveChain](int K, int Idx) {
12452
- if (IterCnt >= MaxIter)
12453
- return true;
12454
- if (CheckedPairs[Idx].test(K))
12455
- return ConsecutiveChain[K].second == 1 &&
12456
- ConsecutiveChain[K].first == Idx;
12457
- ++IterCnt;
12458
- CheckedPairs[Idx].set(K);
12459
- CheckedPairs[K].set(Idx);
12460
- std::optional<int> Diff = getPointersDiff(
12461
- Stores[K]->getValueOperand()->getType(), Stores[K]->getPointerOperand(),
12462
- Stores[Idx]->getValueOperand()->getType(),
12463
- Stores[Idx]->getPointerOperand(), *DL, *SE, /*StrictCheck=*/true);
12464
- if (!Diff || *Diff == 0)
12465
- return false;
12466
- int Val = *Diff;
12467
- if (Val < 0) {
12468
- if (ConsecutiveChain[Idx].second > -Val) {
12469
- Tails.set(K);
12470
- ConsecutiveChain[Idx] = std::make_pair(K, -Val);
12471
- }
12472
- return false;
12438
+ // Stores the pairs of stores (first_store, last_store) delimiting the ranges
12439
+ // that we already tried to vectorize. Allows skipping the store ranges for
12440
+ // which a vectorization attempt was already made but was unsuccessful.
12441
+ DenseSet<std::pair<Value *, Value *>> TriedSequences;
12442
+ struct StoreDistCompare {
12443
+ bool operator()(const std::pair<unsigned, int> &Op1,
12444
+ const std::pair<unsigned, int> &Op2) const {
12445
+ return Op1.second < Op2.second;
12473
12446
}
12474
- if (ConsecutiveChain[K].second <= Val)
12475
- return false;
12476
-
12477
- Tails.set(Idx);
12478
- ConsecutiveChain[K] = std::make_pair(Idx, Val);
12479
- return Val == 1;
12480
12447
};
12481
- // Do a quadratic search on all of the given stores in reverse order and find
12482
- // all of the pairs of stores that follow each other.
12483
- for (int Idx = E - 1; Idx >= 0; --Idx) {
12484
- // If a store has multiple consecutive store candidates, search according
12485
- // to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ...
12486
- // This is because usually pairing with immediate succeeding or preceding
12487
- // candidate create the best chance to find slp vectorization opportunity.
12488
- const int MaxLookDepth = std::max(E - Idx, Idx + 1);
12489
- IterCnt = 0;
12490
- for (int Offset = 1, F = MaxLookDepth; Offset < F; ++Offset)
12491
- if ((Idx >= Offset && FindConsecutiveAccess(Idx - Offset, Idx)) ||
12492
- (Idx + Offset < E && FindConsecutiveAccess(Idx + Offset, Idx)))
12493
- break;
12494
- }
12495
-
12496
- // Tracks if we tried to vectorize stores starting from the given tail
12497
- // already.
12498
- SmallBitVector TriedTails(E, false);
12499
- // For stores that start but don't end a link in the chain:
12500
- for (int Cnt = E; Cnt > 0; --Cnt) {
12501
- int I = Cnt - 1;
12502
- if (ConsecutiveChain[I].first == E || Tails.test(I))
12503
- continue;
12504
- // We found a store instr that starts a chain. Now follow the chain and try
12505
- // to vectorize it.
12448
+ // A set of pairs (index of store in Stores array ref, Distance of the store
12449
+ // address relative to base store address in units).
12450
+ using StoreIndexToDistSet =
12451
+ std::set<std::pair<unsigned, int>, StoreDistCompare>;
12452
+ auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
12453
+ int PrevDist = -1;
12506
12454
BoUpSLP::ValueList Operands;
12507
12455
// Collect the chain into a list.
12508
- while (I != E && !VectorizedStores.count(Stores[I])) {
12509
- Operands.push_back(Stores[I]);
12510
- Tails.set(I);
12511
- if (ConsecutiveChain[I].second != 1) {
12512
- // Mark the new end in the chain and go back, if required. It might be
12513
- // required if the original stores come in reversed order, for example.
12514
- if (ConsecutiveChain[I].first != E &&
12515
- Tails.test(ConsecutiveChain[I].first) && !TriedTails.test(I) &&
12516
- !VectorizedStores.count(Stores[ConsecutiveChain[I].first])) {
12517
- TriedTails.set(I);
12518
- Tails.reset(ConsecutiveChain[I].first);
12519
- if (Cnt < ConsecutiveChain[I].first + 2)
12520
- Cnt = ConsecutiveChain[I].first + 2;
12456
+ for (auto [Idx, Data] : enumerate(Set)) {
12457
+ if (Operands.empty() || Data.second - PrevDist == 1) {
12458
+ Operands.push_back(Stores[Data.first]);
12459
+ PrevDist = Data.second;
12460
+ if (Idx != Set.size() - 1)
12461
+ continue;
12462
+ }
12463
+ if (Operands.size() <= 1) {
12464
+ Operands.clear();
12465
+ Operands.push_back(Stores[Data.first]);
12466
+ PrevDist = Data.second;
12467
+ continue;
12468
+ }
12469
+
12470
+ unsigned MaxVecRegSize = R.getMaxVecRegSize();
12471
+ unsigned EltSize = R.getVectorElementSize(Operands[0]);
12472
+ unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
12473
+
12474
+ unsigned MaxVF =
12475
+ std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
12476
+ auto *Store = cast<StoreInst>(Operands[0]);
12477
+ Type *StoreTy = Store->getValueOperand()->getType();
12478
+ Type *ValueTy = StoreTy;
12479
+ if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
12480
+ ValueTy = Trunc->getSrcTy();
12481
+ unsigned MinVF = TTI->getStoreMinimumVF(
12482
+ R.getMinVF(DL->getTypeSizeInBits(ValueTy)), StoreTy, ValueTy);
12483
+
12484
+ if (MaxVF <= MinVF) {
12485
+ LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
12486
+ << ") <= "
12487
+ << "MinVF (" << MinVF << ")\n");
12488
+ }
12489
+
12490
+ // FIXME: Is division-by-2 the correct step? Should we assert that the
12491
+ // register size is a power-of-2?
12492
+ unsigned StartIdx = 0;
12493
+ for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) {
12494
+ for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
12495
+ ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
12496
+ assert(
12497
+ all_of(
12498
+ Slice,
12499
+ [&](Value *V) {
12500
+ return cast<StoreInst>(V)->getValueOperand()->getType() ==
12501
+ cast<StoreInst>(Slice.front())
12502
+ ->getValueOperand()
12503
+ ->getType();
12504
+ }) &&
12505
+ "Expected all operands of same type.");
12506
+ if (!VectorizedStores.count(Slice.front()) &&
12507
+ !VectorizedStores.count(Slice.back()) &&
12508
+ TriedSequences.insert(std::make_pair(Slice.front(), Slice.back()))
12509
+ .second &&
12510
+ vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
12511
+ // Mark the vectorized stores so that we don't vectorize them again.
12512
+ VectorizedStores.insert(Slice.begin(), Slice.end());
12513
+ Changed = true;
12514
+ // If we vectorized initial block, no need to try to vectorize it
12515
+ // again.
12516
+ if (Cnt == StartIdx)
12517
+ StartIdx += Size;
12518
+ Cnt += Size;
12519
+ continue;
12520
+ }
12521
+ ++Cnt;
12521
12522
}
12522
- break;
12523
+ // Check if the whole array was vectorized already - exit.
12524
+ if (StartIdx >= Operands.size())
12525
+ break;
12523
12526
}
12524
- // Move to the next value in the chain.
12525
- I = ConsecutiveChain[I].first;
12527
+ Operands.clear();
12528
+ Operands.push_back(Stores[Data.first]);
12529
+ PrevDist = Data.second;
12526
12530
}
12527
- assert(!Operands.empty() && "Expected non-empty list of stores.") ;
12531
+ } ;
12528
12532
12529
- unsigned MaxVecRegSize = R.getMaxVecRegSize();
12530
- unsigned EltSize = R.getVectorElementSize(Operands[0]);
12531
- unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
12532
-
12533
- unsigned MaxVF = std::min(R.getMaximumVF(EltSize, Instruction::Store),
12534
- MaxElts);
12535
- auto *Store = cast<StoreInst>(Operands[0]);
12536
- Type *StoreTy = Store->getValueOperand()->getType();
12537
- Type *ValueTy = StoreTy;
12538
- if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
12539
- ValueTy = Trunc->getSrcTy();
12540
- unsigned MinVF = TTI->getStoreMinimumVF(
12541
- R.getMinVF(DL->getTypeSizeInBits(ValueTy)), StoreTy, ValueTy);
12542
-
12543
- if (MaxVF <= MinVF) {
12544
- LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF << ") <= "
12545
- << "MinVF (" << MinVF << ")\n");
12546
- }
12547
-
12548
- // FIXME: Is division-by-2 the correct step? Should we assert that the
12549
- // register size is a power-of-2?
12550
- unsigned StartIdx = 0;
12551
- for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) {
12552
- for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
12553
- ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
12554
- if (!VectorizedStores.count(Slice.front()) &&
12555
- !VectorizedStores.count(Slice.back()) &&
12556
- vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
12557
- // Mark the vectorized stores so that we don't vectorize them again.
12558
- VectorizedStores.insert(Slice.begin(), Slice.end());
12559
- Changed = true;
12560
- // If we vectorized initial block, no need to try to vectorize it
12561
- // again.
12562
- if (Cnt == StartIdx)
12563
- StartIdx += Size;
12564
- Cnt += Size;
12565
- continue;
12566
- }
12567
- ++Cnt;
12533
+ // Each element is a pair (first: index into the Stores array ref of the store
12534
+ // whose address is taken as the base, second: sorted set of pairs {index,
12535
+ // dist}, i.e. indices of the stores in the set and the distances of their
12536
+ // store locations relative to the base address).
12537
+
12538
+ // The index of the very first (base) store is kept separately, since the set
12539
+ // is reordered on insertion and the first store may be moved. This container
12540
+ // allows reducing the number of calls to the getPointersDiff() function.
12541
+ SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
12542
+ // Inserts the specified store SI with the given index Idx to the set of the
12543
+ // stores. If the store with the same distance is found already - stop
12544
+ // insertion, try to vectorize already found stores. If some stores from this
12545
+ // sequence were not vectorized - try to vectorize them with the new store
12546
+ // later. But this logic is applied only to the stores, that come before the
12547
+ // previous store with the same distance.
12548
+ // Example:
12549
+ // 1. store x, %p
12550
+ // 2. store y, %p+1
12551
+ // 3. store z, %p+2
12552
+ // 4. store a, %p
12553
+ // 5. store b, %p+3
12554
+ // - Scan this from the last to first store. The very first bunch of stores is
12555
+ // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
12556
+ // vector).
12557
+ // - The next store in the list - #1 - has the same distance from store #5 as
12558
+ // the store #4.
12559
+ // - Try to vectorize sequence of stores 4,2,3,5.
12560
+ // - If all these stores are vectorized - just drop them.
12561
+ // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
12562
+ // - Start new stores sequence.
12563
+ // The new bunch of stores is {1, {1, 0}}.
12564
+ // - Add the stores from previous sequence, that were not vectorized.
12565
+ // Here we consider the stores in the reversed order, rather than how they are used in
12566
+ // the IR (Stores are reversed already, see vectorizeStoreChains() function).
12567
+ // Store #3 can be added -> comes after store #4 with the same distance as
12568
+ // store #1.
12569
+ // Store #5 cannot be added - comes before store #4.
12570
+ // This logic improves compile time: we assume that the stores coming after
12571
+ // the previous store with the same distance most likely have memory
12572
+ // dependencies, so there is no need to waste compile time trying to vectorize them.
12573
+ // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
12574
+ auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
12575
+ for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
12576
+ std::optional<int> Diff = getPointersDiff(
12577
+ Stores[Set.first]->getValueOperand()->getType(),
12578
+ Stores[Set.first]->getPointerOperand(),
12579
+ SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
12580
+ /*StrictCheck=*/true);
12581
+ if (!Diff)
12582
+ continue;
12583
+ auto It = Set.second.find(std::make_pair(Idx, *Diff));
12584
+ if (It == Set.second.end()) {
12585
+ Set.second.emplace(Idx, *Diff);
12586
+ return;
12568
12587
}
12569
- // Check if the whole array was vectorized already - exit.
12570
- if (StartIdx >= Operands.size())
12571
- break;
12588
+ // Try to vectorize the first found set to avoid duplicate analysis.
12589
+ TryToVectorize(Set.second);
12590
+ StoreIndexToDistSet PrevSet;
12591
+ PrevSet.swap(Set.second);
12592
+ Set.first = Idx;
12593
+ Set.second.emplace(Idx, 0);
12594
+ // Insert stores that followed previous match to try to vectorize them
12595
+ // with this store.
12596
+ unsigned StartIdx = It->first + 1;
12597
+ SmallBitVector UsedStores(Idx - StartIdx);
12598
+ // Distances to previously found dup store (or this store, since they
12599
+ // store to the same addresses).
12600
+ SmallVector<int> Dists(Idx - StartIdx, 0);
12601
+ for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
12602
+ // Do not try to vectorize sequences we already tried.
12603
+ if (Pair.first <= It->first ||
12604
+ VectorizedStores.contains(Stores[Pair.first]))
12605
+ break;
12606
+ unsigned BI = Pair.first - StartIdx;
12607
+ UsedStores.set(BI);
12608
+ Dists[BI] = Pair.second - It->second;
12609
+ }
12610
+ for (unsigned I = StartIdx; I < Idx; ++I) {
12611
+ unsigned BI = I - StartIdx;
12612
+ if (UsedStores.test(BI))
12613
+ Set.second.emplace(I, Dists[BI]);
12614
+ }
12615
+ return;
12572
12616
}
12617
+ auto &Res = SortedStores.emplace_back();
12618
+ Res.first = Idx;
12619
+ Res.second.emplace(Idx, 0);
12620
+ };
12621
+ StoreInst *PrevStore = Stores.front();
12622
+ for (auto [I, SI] : enumerate(Stores)) {
12623
+ // Check that we do not try to vectorize stores of different types.
12624
+ if (PrevStore->getValueOperand()->getType() !=
12625
+ SI->getValueOperand()->getType()) {
12626
+ for (auto &Set : SortedStores)
12627
+ TryToVectorize(Set.second);
12628
+ SortedStores.clear();
12629
+ PrevStore = SI;
12630
+ }
12631
+ FillStoresSet(I, SI);
12573
12632
}
12574
12633
12634
+ // Final vectorization attempt.
12635
+ for (auto &Set : SortedStores)
12636
+ TryToVectorize(Set.second);
12637
+
12575
12638
return Changed;
12576
12639
}
12577
12640
@@ -15135,6 +15198,12 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
15135
15198
// compatible (have the same opcode, same parent), otherwise it is
15136
15199
// definitely not profitable to try to vectorize them.
15137
15200
auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
15201
+ if (V->getValueOperand()->getType()->getTypeID() <
15202
+ V2->getValueOperand()->getType()->getTypeID())
15203
+ return true;
15204
+ if (V->getValueOperand()->getType()->getTypeID() >
15205
+ V2->getValueOperand()->getType()->getTypeID())
15206
+ return false;
15138
15207
if (V->getPointerOperandType()->getTypeID() <
15139
15208
V2->getPointerOperandType()->getTypeID())
15140
15209
return true;
@@ -15173,6 +15242,8 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
15173
15242
auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
15174
15243
if (V1 == V2)
15175
15244
return true;
15245
+ if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
15246
+ return false;
15176
15247
if (V1->getPointerOperandType() != V2->getPointerOperandType())
15177
15248
return false;
15178
15249
// Undefs are compatible with any other value.
@@ -15204,8 +15275,13 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
15204
15275
if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
15205
15276
continue;
15206
15277
15278
+ // Reverse stores to do bottom-to-top analysis. This is important if the
15279
+ // values are stored to the same addresses several times; in this case we need
15280
+ // to follow the stores order (reversed to meet the memory dependencies).
15281
+ SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
15282
+ Pair.second.rend());
15207
15283
Changed |= tryToVectorizeSequence<StoreInst>(
15208
- Pair.second , StoreSorter, AreCompatibleStores,
15284
+ ReversedStores , StoreSorter, AreCompatibleStores,
15209
15285
[this, &R](ArrayRef<StoreInst *> Candidates, bool) {
15210
15286
return vectorizeStores(Candidates, R);
15211
15287
},
0 commit comments