Commit 71bcead

[SLP] Make reordering aware of external vectorizable scalar stores.
The current reordering scheme only checks the ordering of in-tree operands. There are some cases, however, where we need to adjust the ordering based on the ordering of a future SLP-tree whose instructions are not part of the current tree, but are external users of it.

This patch is a simple implementation of this. We keep track of scalar stores that are users of TreeEntries and, if they look profitable to vectorize, we keep track of their ordering. During the reordering step we take this new index order into account. This can remove some shuffles, as in the lit test.

Differential Revision: https://reviews.llvm.org/D125111
1 parent 7731935 commit 71bcead
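
To make the mechanism concrete before the diff: for each lane of a vectorizable tree, the patch looks at where that lane's value is written by external scalar stores, and converts those store offsets into an order index vector for the reordering step. Below is a minimal self-contained sketch of that index calculation (plain C++17, not code from the patch; reorderIndicesForStores and its types are illustrative):

#include <algorithm>
#include <cstdio>
#include <vector>

// LaneOffsets[Lane] is the (distinct) element offset that the lane's value is
// stored to. The result maps each lane to its position in the offset-sorted,
// i.e. vectorized, store bundle.
std::vector<unsigned>
reorderIndicesForStores(const std::vector<int> &LaneOffsets) {
  std::vector<int> Sorted = LaneOffsets;
  std::sort(Sorted.begin(), Sorted.end());
  std::vector<unsigned> ReorderIndices(LaneOffsets.size());
  for (unsigned Lane = 0; Lane < LaneOffsets.size(); ++Lane)
    ReorderIndices[Lane] =
        std::find(Sorted.begin(), Sorted.end(), LaneOffsets[Lane]) -
        Sorted.begin();
  return ReorderIndices;
}

int main() {
  // Lane 0 is stored to offset 1 and lane 1 to offset 0, as in the lit test
  // below, so the tree should be reordered as {1, 0} to avoid a shuffle.
  for (unsigned Idx : reorderIndicesForStores({1, 0}))
    std::printf("%u ", Idx); // prints "1 0"
  std::printf("\n");
}

CanFormVector() in the diff below implements the same calculation over getPointersDiff() results.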

2 files changed: +204 -9 lines

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 199 additions & 3 deletions
@@ -2184,6 +2184,29 @@ class BoUpSLP {
                      const DataLayout &DL,
                      ScalarEvolution &SE,
                      const BoUpSLP &R);
+
+  /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over
+  /// the users of \p TE and collects the stores. It returns the map from the
+  /// store pointers to the collected stores.
+  DenseMap<Value *, SmallVector<StoreInst *, 4>>
+  collectUserStores(const BoUpSLP::TreeEntry *TE) const;
+
+  /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
+  /// stores in \p StoresVec can form a vector instruction. If so it returns
+  /// true and populates \p ReorderIndices with the shuffle indices of the
+  /// stores when compared to the sorted vector.
+  bool CanFormVector(const SmallVector<StoreInst *, 4> &StoresVec,
+                     OrdersType &ReorderIndices) const;
+
+  /// Iterates through the users of \p TE, looking for scalar stores that can
+  /// be potentially vectorized in a future SLP-tree. If found, it keeps track
+  /// of their order and builds an order index vector for each store bundle.
+  /// It returns all these order vectors found.
+  /// We run this after the tree has formed, otherwise we may come across user
+  /// instructions that are not yet in the tree.
+  SmallVector<OrdersType, 1>
+  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
+
   struct TreeEntry {
     using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
     TreeEntry(VecTreeTy &Container) : Container(Container) {}
@@ -3584,11 +3607,25 @@ void BoUpSLP::reorderTopToBottom() {
   // ExtractElement gather nodes which can be vectorized and need to handle
   // their ordering.
   DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
+
+  // Maps a TreeEntry to the reorder indices of external users.
+  DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
+      ExternalUserReorderMap;
   // Find all reorderable nodes with the given VF.
   // Currently these are vectorized stores, loads, extracts + some gathering
   // of extracts.
-  for_each(VectorizableTree, [this, &VFToOrderedEntries, &GathersToOrders](
+  for_each(VectorizableTree, [this, &VFToOrderedEntries, &GathersToOrders,
+                              &ExternalUserReorderMap](
               const std::unique_ptr<TreeEntry> &TE) {
+    // Look for external users that will probably be vectorized.
+    SmallVector<OrdersType, 1> ExternalUserReorderIndices =
+        findExternalStoreUsersReorderIndices(TE.get());
+    if (!ExternalUserReorderIndices.empty()) {
+      VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
+      ExternalUserReorderMap.try_emplace(
+          TE.get(), std::move(ExternalUserReorderIndices));
+    }
+
     if (Optional<OrdersType> CurrentOrder =
             getReorderingData(*TE, /*TopToBottom=*/true)) {
       // Do not include ordering for nodes used in the alt opcode vectorization,
@@ -3643,10 +3680,23 @@ void BoUpSLP::reorderTopToBottom() {
       continue;
     // Count number of orders uses.
     const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & {
-      if (OpTE->State == TreeEntry::NeedToGather)
-        return GathersToOrders.find(OpTE)->second;
+      if (OpTE->State == TreeEntry::NeedToGather) {
+        auto It = GathersToOrders.find(OpTE);
+        if (It != GathersToOrders.end())
+          return It->second;
+      }
       return OpTE->ReorderIndices;
     }();
+    // First consider the order of the external scalar users.
+    auto It = ExternalUserReorderMap.find(OpTE);
+    if (It != ExternalUserReorderMap.end()) {
+      const auto &ExternalUserReorderIndices = It->second;
+      for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
+        ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
+      // No other useful reorder data in this entry.
+      if (Order.empty())
+        continue;
+    }
     // Stores actually store the mask, not the order, need to invert.
     if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
         OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
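
The `++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second` line above is the usual insert-or-increment voting idiom: each external order receives one vote per user store bundle, counted before the in-tree orders are tallied. A tiny self-contained illustration (plain C++; std::map stands in for the LLVM map type used here):

#include <cassert>
#include <map>
#include <vector>

using OrdersTy = std::vector<unsigned>;

int main() {
  std::map<OrdersTy, unsigned> OrdersUses;
  OrdersTy ExtOrder = {1, 0};
  // insert() returns {iterator, inserted}; whether the key was just created
  // with count 0 or already existed, ++ on .first->second adds one vote.
  ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
  ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
  assert(OrdersUses[ExtOrder] == 2);
}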
@@ -4078,6 +4128,152 @@ void BoUpSLP::buildExternalUses(
   }
 }
 
+DenseMap<Value *, SmallVector<StoreInst *, 4>>
+BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
+  DenseMap<Value *, SmallVector<StoreInst *, 4>> PtrToStoresMap;
+  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
+    Value *V = TE->Scalars[Lane];
+    // To save compilation time we don't visit if we have too many users.
+    static constexpr unsigned UsersLimit = 4;
+    if (V->hasNUsesOrMore(UsersLimit))
+      break;
+
+    // Collect stores per pointer object.
+    for (User *U : V->users()) {
+      auto *SI = dyn_cast<StoreInst>(U);
+      if (SI == nullptr || !SI->isSimple() ||
+          !isValidElementType(SI->getValueOperand()->getType()))
+        continue;
+      // Skip entry if already in the tree.
+      if (getTreeEntry(U))
+        continue;
+
+      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
+      auto &StoresVec = PtrToStoresMap[Ptr];
+      // For now just keep one store per pointer object per lane.
+      // TODO: Extend this to support multiple stores per pointer per lane.
+      if (StoresVec.size() > Lane)
+        continue;
+      // Skip if in different BBs.
+      if (!StoresVec.empty() &&
+          SI->getParent() != StoresVec.back()->getParent())
+        continue;
+      // Make sure that the stores are of the same type.
+      if (!StoresVec.empty() &&
+          SI->getValueOperand()->getType() !=
+              StoresVec.back()->getValueOperand()->getType())
+        continue;
+      StoresVec.push_back(SI);
+    }
+  }
+  return PtrToStoresMap;
+}
+
+bool BoUpSLP::CanFormVector(const SmallVector<StoreInst *, 4> &StoresVec,
+                            OrdersType &ReorderIndices) const {
+  // We check whether the stores in StoresVec can form a vector by sorting
+  // them and checking whether they are consecutive.
+
+  // To avoid calling getPointersDiff() while sorting we create a vector of
+  // pairs {store, offset from first} and sort this instead.
+  SmallVector<std::pair<StoreInst *, int>, 4> StoreOffsetVec(StoresVec.size());
+  StoreInst *S0 = StoresVec[0];
+  StoreOffsetVec[0] = {S0, 0};
+  Type *S0Ty = S0->getValueOperand()->getType();
+  Value *S0Ptr = S0->getPointerOperand();
+  for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
+    StoreInst *SI = StoresVec[Idx];
+    Optional<int> Diff =
+        getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
+                        SI->getPointerOperand(), *DL, *SE,
+                        /*StrictCheck=*/true);
+    // We failed to compare the pointers so just abandon this StoresVec.
+    if (!Diff)
+      return false;
+    StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
+  }
+
+  // Sort the vector based on the pointers. We create a copy because we may
+  // need the original later for calculating the reorder (shuffle) indices.
+  stable_sort(StoreOffsetVec, [](const std::pair<StoreInst *, int> &Pair1,
+                                 const std::pair<StoreInst *, int> &Pair2) {
+    int Offset1 = Pair1.second;
+    int Offset2 = Pair2.second;
+    return Offset1 < Offset2;
+  });
+
+  // Check if the stores are consecutive by checking if last-first == size-1.
+  int LastOffset = StoreOffsetVec.back().second;
+  int FirstOffset = StoreOffsetVec.front().second;
+  if (LastOffset - FirstOffset != (int)StoreOffsetVec.size() - 1)
+    return false;
+
+  // Calculate the shuffle indices according to their offset against the
+  // sorted StoreOffsetVec.
+  ReorderIndices.reserve(StoresVec.size());
+  for (StoreInst *SI : StoresVec) {
+    unsigned Idx = find_if(StoreOffsetVec,
+                           [SI](const std::pair<StoreInst *, int> &Pair) {
+                             return Pair.first == SI;
+                           }) -
+                   StoreOffsetVec.begin();
+    ReorderIndices.push_back(Idx);
+  }
+  // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
+  // reorderTopToBottom() and reorderBottomToTop(), so we are following the
+  // same convention here.
+  auto IsIdentityOrder = [](const OrdersType &Order) {
+    for (unsigned Idx : seq<unsigned>(0, Order.size()))
+      if (Idx != Order[Idx])
+        return false;
+    return true;
+  };
+  if (IsIdentityOrder(ReorderIndices))
+    ReorderIndices.clear();
+
+  return true;
+}
+
+#ifndef NDEBUG
+LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
+  for (unsigned Idx : Order)
+    dbgs() << Idx << ", ";
+  dbgs() << "\n";
+}
+#endif
+
+SmallVector<BoUpSLP::OrdersType, 1>
+BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
+  unsigned NumLanes = TE->Scalars.size();
+
+  DenseMap<Value *, SmallVector<StoreInst *, 4>> PtrToStoresMap =
+      collectUserStores(TE);
+
+  // Holds the reorder indices for each candidate store vector that is a user
+  // of the current TreeEntry.
+  SmallVector<OrdersType, 1> ExternalReorderIndices;
+
+  // Now inspect the stores collected per pointer and look for vectorization
+  // candidates. For each candidate calculate the reorder index vector and
+  // push it into `ExternalReorderIndices`.
+  for (const auto &Pair : PtrToStoresMap) {
+    auto &StoresVec = Pair.second;
+    // If we have fewer than NumLanes stores, then we can't form a vector.
+    if (StoresVec.size() != NumLanes)
+      continue;
+
+    // If the stores are not consecutive then abandon this StoresVec.
+    OrdersType ReorderIndices;
+    if (!CanFormVector(StoresVec, ReorderIndices))
+      continue;
+
+    // We now know that the scalars in StoresVec can form a vector
+    // instruction, so set the reorder indices.
+    ExternalReorderIndices.push_back(ReorderIndices);
+  }
+  return ExternalReorderIndices;
+}
+
 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                         ArrayRef<Value *> UserIgnoreLst) {
   deleteTree();
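
One detail of CanFormVector() above worth spelling out: after sorting, consecutiveness reduces to the single comparison last - first == size - 1. A hedged standalone sketch of just that check (plain C++; offsetsAreConsecutive is an illustrative name), assuming pairwise-distinct offsets (the diff computes them with getPointersDiff() under /*StrictCheck=*/true):

#include <algorithm>
#include <cassert>
#include <vector>

// Pairwise-distinct sorted offsets cover a gap-free, unit-stride range
// exactly when the span equals size - 1.
bool offsetsAreConsecutive(std::vector<int> Offsets) {
  std::sort(Offsets.begin(), Offsets.end());
  return Offsets.back() - Offsets.front() ==
         static_cast<int>(Offsets.size()) - 1;
}

int main() {
  assert(offsetsAreConsecutive({1, 0, 2}));  // sorts to {0, 1, 2}: accepted
  assert(!offsetsAreConsecutive({0, 1, 3})); // gap at offset 2: rejected
}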

llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll

Lines changed: 5 additions & 6 deletions
@@ -14,18 +14,17 @@ define void @rotate_with_external_users(double *%A, double *%ptr) {
 ; CHECK-NEXT:    [[LD:%.*]] = load double, double* undef, align 8
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[LD]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[LD]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], <double 1.100000e+00, double 2.200000e+00>
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], <double 1.100000e+00, double 2.200000e+00>
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], <double 2.200000e+00, double 1.100000e+00>
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], <double 2.200000e+00, double 1.100000e+00>
 ; CHECK-NEXT:    [[PTRA1:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[PTRA1]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[SHUFFLE]], <2 x double>* [[TMP4]], align 8
+; CHECK-NEXT:    store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8
 ; CHECK-NEXT:    br label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP3]], <double 3.300000e+00, double 4.400000e+00>
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP3]], <double 4.400000e+00, double 3.300000e+00>
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
-; CHECK-NEXT:    [[SEED:%.*]] = fcmp ogt double [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[SEED:%.*]] = fcmp ogt double [[TMP7]], [[TMP6]]
 ; CHECK-NEXT:    ret void
 ;
 bb1:
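
The updated CHECK lines show the patch at work: the fadd/fmul constant operands swap lanes because the tree is now built in the order the external stores prefer, the shufflevector that previously rearranged [[TMP3]] before the vector store is gone, and the in-tree users in bb2 (fadd, extracts, fcmp) are rewritten for the new lane order.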
