Skip to content

Commit 2f6ca38

Browse files
committed
Revert "[SLP]Improve stores vectorization."
This reverts commit 58b0d7c to fix crashes reported in https://lab.llvm.org/buildbot/#/builders/85/builds/18117.
1 parent a6d6730 commit 2f6ca38

File tree

3 files changed

+135
-177
lines changed

3 files changed

+135
-177
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 127 additions & 175 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,10 @@ static cl::opt<unsigned>
140140
MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
141141
cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
142142

143+
static cl::opt<int>
144+
MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden,
145+
cl::desc("Maximum depth of the lookup for consecutive stores."));
146+
143147
/// Limits the size of scheduling regions in a block.
144148
/// It avoid long compile times for _very_ large blocks where vector
145149
/// instructions are spread over a wide range.
@@ -12435,185 +12439,138 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
1243512439
BoUpSLP::ValueSet VectorizedStores;
1243612440
bool Changed = false;
1243712441

12438-
// Stores the pair of stores (first_store, last_store) in a range, that were
12439-
// already tried to be vectorized. Allows to skip the store ranges that were
12440-
// already tried to be vectorized but the attempts were unsuccessful.
12441-
DenseSet<std::pair<Value *, Value *>> TriedSequences;
12442-
struct StoreDistCompare {
12443-
bool operator()(const std::pair<unsigned, int> &Op1,
12444-
const std::pair<unsigned, int> &Op2) const {
12445-
return Op1.second < Op2.second;
12442+
int E = Stores.size();
12443+
SmallBitVector Tails(E, false);
12444+
int MaxIter = MaxStoreLookup.getValue();
12445+
SmallVector<std::pair<int, int>, 16> ConsecutiveChain(
12446+
E, std::make_pair(E, INT_MAX));
12447+
SmallVector<SmallBitVector, 4> CheckedPairs(E, SmallBitVector(E, false));
12448+
int IterCnt;
12449+
auto &&FindConsecutiveAccess = [this, &Stores, &Tails, &IterCnt, MaxIter,
12450+
&CheckedPairs,
12451+
&ConsecutiveChain](int K, int Idx) {
12452+
if (IterCnt >= MaxIter)
12453+
return true;
12454+
if (CheckedPairs[Idx].test(K))
12455+
return ConsecutiveChain[K].second == 1 &&
12456+
ConsecutiveChain[K].first == Idx;
12457+
++IterCnt;
12458+
CheckedPairs[Idx].set(K);
12459+
CheckedPairs[K].set(Idx);
12460+
std::optional<int> Diff = getPointersDiff(
12461+
Stores[K]->getValueOperand()->getType(), Stores[K]->getPointerOperand(),
12462+
Stores[Idx]->getValueOperand()->getType(),
12463+
Stores[Idx]->getPointerOperand(), *DL, *SE, /*StrictCheck=*/true);
12464+
if (!Diff || *Diff == 0)
12465+
return false;
12466+
int Val = *Diff;
12467+
if (Val < 0) {
12468+
if (ConsecutiveChain[Idx].second > -Val) {
12469+
Tails.set(K);
12470+
ConsecutiveChain[Idx] = std::make_pair(K, -Val);
12471+
}
12472+
return false;
1244612473
}
12474+
if (ConsecutiveChain[K].second <= Val)
12475+
return false;
12476+
12477+
Tails.set(Idx);
12478+
ConsecutiveChain[K] = std::make_pair(Idx, Val);
12479+
return Val == 1;
1244712480
};
12448-
// A set of pairs (index of store in Stores array ref, Distance of the store
12449-
// address relative to base store address in units).
12450-
using StoreIndexToDistSet =
12451-
std::set<std::pair<unsigned, int>, StoreDistCompare>;
12452-
auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
12453-
int PrevDist = -1;
12481+
// Do a quadratic search on all of the given stores in reverse order and find
12482+
// all of the pairs of stores that follow each other.
12483+
for (int Idx = E - 1; Idx >= 0; --Idx) {
12484+
// If a store has multiple consecutive store candidates, search according
12485+
// to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ...
12486+
// This is because usually pairing with immediate succeeding or preceding
12487+
// candidate create the best chance to find slp vectorization opportunity.
12488+
const int MaxLookDepth = std::max(E - Idx, Idx + 1);
12489+
IterCnt = 0;
12490+
for (int Offset = 1, F = MaxLookDepth; Offset < F; ++Offset)
12491+
if ((Idx >= Offset && FindConsecutiveAccess(Idx - Offset, Idx)) ||
12492+
(Idx + Offset < E && FindConsecutiveAccess(Idx + Offset, Idx)))
12493+
break;
12494+
}
12495+
12496+
// Tracks if we tried to vectorize stores starting from the given tail
12497+
// already.
12498+
SmallBitVector TriedTails(E, false);
12499+
// For stores that start but don't end a link in the chain:
12500+
for (int Cnt = E; Cnt > 0; --Cnt) {
12501+
int I = Cnt - 1;
12502+
if (ConsecutiveChain[I].first == E || Tails.test(I))
12503+
continue;
12504+
// We found a store instr that starts a chain. Now follow the chain and try
12505+
// to vectorize it.
1245412506
BoUpSLP::ValueList Operands;
1245512507
// Collect the chain into a list.
12456-
for (auto [Idx, Data] : enumerate(Set)) {
12457-
if (Operands.empty() || Data.second - PrevDist == 1) {
12458-
Operands.push_back(Stores[Data.first]);
12459-
PrevDist = Data.second;
12460-
if (Idx != Set.size() - 1)
12461-
continue;
12462-
}
12463-
if (Operands.size() <= 1) {
12464-
Operands.clear();
12465-
Operands.push_back(Stores[Data.first]);
12466-
PrevDist = Data.second;
12467-
continue;
12468-
}
12469-
12470-
unsigned MaxVecRegSize = R.getMaxVecRegSize();
12471-
unsigned EltSize = R.getVectorElementSize(Operands[0]);
12472-
unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
12473-
12474-
unsigned MaxVF =
12475-
std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
12476-
auto *Store = cast<StoreInst>(Operands[0]);
12477-
Type *StoreTy = Store->getValueOperand()->getType();
12478-
Type *ValueTy = StoreTy;
12479-
if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
12480-
ValueTy = Trunc->getSrcTy();
12481-
unsigned MinVF = TTI->getStoreMinimumVF(
12482-
R.getMinVF(DL->getTypeSizeInBits(ValueTy)), StoreTy, ValueTy);
12483-
12484-
if (MaxVF <= MinVF) {
12485-
LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
12486-
<< ") <= "
12487-
<< "MinVF (" << MinVF << ")\n");
12488-
}
12489-
12490-
// FIXME: Is division-by-2 the correct step? Should we assert that the
12491-
// register size is a power-of-2?
12492-
unsigned StartIdx = 0;
12493-
for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) {
12494-
for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
12495-
ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
12496-
if (!VectorizedStores.count(Slice.front()) &&
12497-
!VectorizedStores.count(Slice.back()) &&
12498-
TriedSequences.insert(std::make_pair(Slice.front(), Slice.back()))
12499-
.second &&
12500-
vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
12501-
// Mark the vectorized stores so that we don't vectorize them again.
12502-
VectorizedStores.insert(Slice.begin(), Slice.end());
12503-
Changed = true;
12504-
// If we vectorized initial block, no need to try to vectorize it
12505-
// again.
12506-
if (Cnt == StartIdx)
12507-
StartIdx += Size;
12508-
Cnt += Size;
12509-
continue;
12510-
}
12511-
++Cnt;
12508+
while (I != E && !VectorizedStores.count(Stores[I])) {
12509+
Operands.push_back(Stores[I]);
12510+
Tails.set(I);
12511+
if (ConsecutiveChain[I].second != 1) {
12512+
// Mark the new end in the chain and go back, if required. It might be
12513+
// required if the original stores come in reversed order, for example.
12514+
if (ConsecutiveChain[I].first != E &&
12515+
Tails.test(ConsecutiveChain[I].first) && !TriedTails.test(I) &&
12516+
!VectorizedStores.count(Stores[ConsecutiveChain[I].first])) {
12517+
TriedTails.set(I);
12518+
Tails.reset(ConsecutiveChain[I].first);
12519+
if (Cnt < ConsecutiveChain[I].first + 2)
12520+
Cnt = ConsecutiveChain[I].first + 2;
1251212521
}
12513-
// Check if the whole array was vectorized already - exit.
12514-
if (StartIdx >= Operands.size())
12515-
break;
12522+
break;
1251612523
}
12517-
Operands.clear();
12518-
Operands.push_back(Stores[Data.first]);
12519-
PrevDist = Data.second;
12524+
// Move to the next value in the chain.
12525+
I = ConsecutiveChain[I].first;
1252012526
}
12521-
};
12527+
assert(!Operands.empty() && "Expected non-empty list of stores.");
1252212528

12523-
// Stores pair (first: index of the store into Stores array ref, address of
12524-
// which taken as base, second: sorted set of pairs {index, dist}, which are
12525-
// indices of stores in the set and their store location distances relative to
12526-
// the base address).
12527-
12528-
// Need to store the index of the very first store separately, since the set
12529-
// may be reordered after the insertion and the first store may be moved. This
12530-
// container allows to reduce number of calls of getPointersDiff() function.
12531-
SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
12532-
// Inserts the specified store SI with the given index Idx to the set of the
12533-
// stores. If the store with the same distance is found already - stop
12534-
// insertion, try to vectorize already found stores. If some stores from this
12535-
// sequence were not vectorized - try to vectorize them with the new store
12536-
// later. But this logic is applied only to the stores, that come before the
12537-
// previous store with the same distance.
12538-
// Example:
12539-
// 1. store x, %p
12540-
// 2. store y, %p+1
12541-
// 3. store z, %p+2
12542-
// 4. store a, %p
12543-
// 5. store b, %p+3
12544-
// - Scan this from the last to first store. The very first bunch of stores is
12545-
// {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
12546-
// vector).
12547-
// - The next store in the list - #1 - has the same distance from store #5 as
12548-
// the store #4.
12549-
// - Try to vectorize sequence of stores 4,2,3,5.
12550-
// - If all these stores are vectorized - just drop them.
12551-
// - If some of them are not vectorized (say, #3 and #5), do extra analysis.
12552-
// - Start new stores sequence.
12553-
// The new bunch of stores is {1, {1, 0}}.
12554-
// - Add the stores from previous sequence, that were not vectorized.
12555-
// Here we consider the stores in the reversed order, rather they are used in
12556-
// the IR (Stores are reversed already, see vectorizeStoreChains() function).
12557-
// Store #3 can be added -> comes after store #4 with the same distance as
12558-
// store #1.
12559-
// Store #5 cannot be added - comes before store #4.
12560-
// This logic allows to improve the compile time, we assume that the stores
12561-
// after previous store with the same distance most likely have memory
12562-
// dependencies and no need to waste compile time to try to vectorize them.
12563-
// - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
12564-
auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
12565-
for (auto &Set : SortedStores) {
12566-
std::optional<int> Diff = getPointersDiff(
12567-
Stores[Set.first]->getValueOperand()->getType(),
12568-
Stores[Set.first]->getPointerOperand(),
12569-
SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
12570-
/*StrictCheck=*/true);
12571-
if (!Diff)
12572-
continue;
12573-
auto It = Set.second.find(std::make_pair(Idx, *Diff));
12574-
if (It == Set.second.end()) {
12575-
Set.second.emplace(Idx, *Diff);
12576-
return;
12577-
}
12578-
// Try to vectorize the first found set to avoid duplicate analysis.
12579-
TryToVectorize(Set.second);
12580-
StoreIndexToDistSet PrevSet;
12581-
PrevSet.swap(Set.second);
12582-
Set.first = Idx;
12583-
Set.second.emplace(Idx, 0);
12584-
// Insert stores that followed previous match to try to vectorize them
12585-
// with this store.
12586-
unsigned StartIdx = It->first + 1;
12587-
SmallBitVector UsedStores(Idx - StartIdx);
12588-
// Distances to previously found dup store (or this store, since they
12589-
// store to the same addresses).
12590-
SmallVector<int> Dists(Idx - StartIdx, 0);
12591-
for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
12592-
// Do not try to vectorize sequences, we already tried.
12593-
if (Pair.first <= It->first ||
12594-
VectorizedStores.contains(Stores[Pair.first]))
12595-
break;
12596-
unsigned BI = Pair.first - StartIdx;
12597-
UsedStores.set(BI);
12598-
Dists[BI] = Pair.second - It->second;
12599-
}
12600-
for (unsigned I = StartIdx; I < Idx; ++I) {
12601-
unsigned BI = I - StartIdx;
12602-
if (BI < UsedStores.size() && UsedStores.test(BI))
12603-
Set.second.emplace(I, Dists[BI]);
12529+
unsigned MaxVecRegSize = R.getMaxVecRegSize();
12530+
unsigned EltSize = R.getVectorElementSize(Operands[0]);
12531+
unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
12532+
12533+
unsigned MaxVF = std::min(R.getMaximumVF(EltSize, Instruction::Store),
12534+
MaxElts);
12535+
auto *Store = cast<StoreInst>(Operands[0]);
12536+
Type *StoreTy = Store->getValueOperand()->getType();
12537+
Type *ValueTy = StoreTy;
12538+
if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
12539+
ValueTy = Trunc->getSrcTy();
12540+
unsigned MinVF = TTI->getStoreMinimumVF(
12541+
R.getMinVF(DL->getTypeSizeInBits(ValueTy)), StoreTy, ValueTy);
12542+
12543+
if (MaxVF <= MinVF) {
12544+
LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF << ") <= "
12545+
<< "MinVF (" << MinVF << ")\n");
12546+
}
12547+
12548+
// FIXME: Is division-by-2 the correct step? Should we assert that the
12549+
// register size is a power-of-2?
12550+
unsigned StartIdx = 0;
12551+
for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) {
12552+
for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
12553+
ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
12554+
if (!VectorizedStores.count(Slice.front()) &&
12555+
!VectorizedStores.count(Slice.back()) &&
12556+
vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
12557+
// Mark the vectorized stores so that we don't vectorize them again.
12558+
VectorizedStores.insert(Slice.begin(), Slice.end());
12559+
Changed = true;
12560+
// If we vectorized initial block, no need to try to vectorize it
12561+
// again.
12562+
if (Cnt == StartIdx)
12563+
StartIdx += Size;
12564+
Cnt += Size;
12565+
continue;
12566+
}
12567+
++Cnt;
1260412568
}
12605-
return;
12569+
// Check if the whole array was vectorized already - exit.
12570+
if (StartIdx >= Operands.size())
12571+
break;
1260612572
}
12607-
auto &Res = SortedStores.emplace_back();
12608-
Res.first = Idx;
12609-
Res.second.emplace(Idx, 0);
12610-
};
12611-
for (auto [I, SI] : enumerate(Stores))
12612-
FillStoresSet(I, SI);
12613-
12614-
// Final vectorization attempt.
12615-
for (auto &Set : SortedStores)
12616-
TryToVectorize(Set.second);
12573+
}
1261712574

1261812575
return Changed;
1261912576
}
@@ -15247,13 +15204,8 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
1524715204
if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
1524815205
continue;
1524915206

15250-
// Reverse stores to do bottom-to-top analysis. This is important if the
15251-
// values are stores to the same addresses several times, in this case need
15252-
// to follow the stores order (reversed to meet the memory dependecies).
15253-
SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
15254-
Pair.second.rend());
1525515207
Changed |= tryToVectorizeSequence<StoreInst>(
15256-
ReversedStores, StoreSorter, AreCompatibleStores,
15208+
Pair.second, StoreSorter, AreCompatibleStores,
1525715209
[this, &R](ArrayRef<StoreInst *> Candidates, bool) {
1525815210
return vectorizeStores(Candidates, R);
1525915211
},

llvm/test/Transforms/SLPVectorizer/X86/many_stores.ll

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@ define i32 @test(ptr %p) {
55
; CHECK-LABEL: define i32 @test
66
; CHECK-SAME: (ptr [[P:%.*]]) {
77
; CHECK-NEXT: entry:
8+
; CHECK-NEXT: [[IDX2:%.*]] = getelementptr i32, ptr [[P]], i64 4
9+
; CHECK-NEXT: store i32 0, ptr [[IDX2]], align 4
10+
; CHECK-NEXT: [[IDX3:%.*]] = getelementptr i32, ptr [[P]], i64 6
11+
; CHECK-NEXT: store i32 0, ptr [[IDX3]], align 4
812
; CHECK-NEXT: [[IDX4:%.*]] = getelementptr i32, ptr [[P]], i64 8
913
; CHECK-NEXT: store i32 0, ptr [[IDX4]], align 4
1014
; CHECK-NEXT: [[IDX5:%.*]] = getelementptr i32, ptr [[P]], i64 10
@@ -65,7 +69,9 @@ define i32 @test(ptr %p) {
6569
; CHECK-NEXT: store i32 0, ptr [[IDX33]], align 4
6670
; CHECK-NEXT: store i32 0, ptr [[P]], align 4
6771
; CHECK-NEXT: [[IDX0:%.*]] = getelementptr i32, ptr [[P]], i64 3
68-
; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[IDX0]], align 4
72+
; CHECK-NEXT: store i32 0, ptr [[IDX0]], align 4
73+
; CHECK-NEXT: [[IDX1:%.*]] = getelementptr i32, ptr [[P]], i64 5
74+
; CHECK-NEXT: store i32 0, ptr [[IDX1]], align 4
6975
; CHECK-NEXT: ret i32 0
7076
;
7177
entry:

llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2-
; RUN: opt < %s -S -mtriple=x86_64-unknown -passes=slp-vectorizer -slp-min-reg-size=64 -slp-threshold=-1000 | FileCheck %s
2+
; RUN: opt < %s -S -mtriple=x86_64-unknown -passes=slp-vectorizer -slp-max-store-lookup=2 -slp-min-reg-size=64 -slp-threshold=-1000 | FileCheck %s
33

44
define i32 @non-ordered-stores(ptr noalias nocapture %in, ptr noalias nocapture %inn, ptr noalias nocapture %out) {
55
; CHECK-LABEL: @non-ordered-stores(

0 commit comments

Comments
 (0)