@@ -2184,6 +2184,29 @@ class BoUpSLP {
2184
2184
const DataLayout &DL,
2185
2185
ScalarEvolution &SE,
2186
2186
const BoUpSLP &R);
2187
+
2188
+ // / Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
2189
+ // / users of \p TE and collects the stores. It returns the map from the store
2190
+ // / pointers to the collected stores.
2191
+ DenseMap<Value *, SmallVector<StoreInst *, 4 >>
2192
+ collectUserStores (const BoUpSLP::TreeEntry *TE) const ;
2193
+
2194
+ // / Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
2195
+ // / stores in \p StoresVec can form a vector instruction. If so it returns true
2196
+ // / and populates \p ReorderIndices with the shuffle indices of the stores
2197
+ // / when compared to the sorted vector.
2198
+ bool CanFormVector (const SmallVector<StoreInst *, 4 > &StoresVec,
2199
+ OrdersType &ReorderIndices) const ;
2200
+
2201
+ // / Iterates through the users of \p TE, looking for scalar stores that can be
2202
+ // / potentially vectorized in a future SLP-tree. If found, it keeps track of
2203
+ // / their order and builds an order index vector for each store bundle. It
2204
+ // / returns all these order vectors found.
2205
+ // / We run this after the tree has formed, otherwise we may come across user
2206
+ // / instructions that are not yet in the tree.
2207
+ SmallVector<OrdersType, 1 >
2208
+ findExternalStoreUsersReorderIndices (TreeEntry *TE) const ;
2209
+
2187
2210
struct TreeEntry {
2188
2211
using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8 >;
2189
2212
TreeEntry (VecTreeTy &Container) : Container(Container) {}
@@ -3584,11 +3607,25 @@ void BoUpSLP::reorderTopToBottom() {
3584
3607
// ExtractElement gather nodes which can be vectorized and need to handle
3585
3608
// their ordering.
3586
3609
DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
3610
+
3611
+ // Maps a TreeEntry to the reorder indices of external users.
3612
+ DenseMap<const TreeEntry *, SmallVector<OrdersType, 1 >>
3613
+ ExternalUserReorderMap;
3587
3614
// Find all reorderable nodes with the given VF.
3588
3615
// Currently these are vectorized stores, loads, extracts + some gathering of
3589
3616
// extracts.
3590
- for_each (VectorizableTree, [this , &VFToOrderedEntries, &GathersToOrders](
3617
+ for_each (VectorizableTree, [this , &VFToOrderedEntries, &GathersToOrders,
3618
+ &ExternalUserReorderMap](
3591
3619
const std::unique_ptr<TreeEntry> &TE) {
3620
+ // Look for external users that will probably be vectorized.
3621
+ SmallVector<OrdersType, 1 > ExternalUserReorderIndices =
3622
+ findExternalStoreUsersReorderIndices (TE.get ());
3623
+ if (!ExternalUserReorderIndices.empty ()) {
3624
+ VFToOrderedEntries[TE->Scalars .size ()].insert (TE.get ());
3625
+ ExternalUserReorderMap.try_emplace (TE.get (),
3626
+ std::move (ExternalUserReorderIndices));
3627
+ }
3628
+
3592
3629
if (Optional<OrdersType> CurrentOrder =
3593
3630
getReorderingData (*TE, /* TopToBottom=*/ true )) {
3594
3631
// Do not include ordering for nodes used in the alt opcode vectorization,
@@ -3643,10 +3680,23 @@ void BoUpSLP::reorderTopToBottom() {
3643
3680
continue ;
3644
3681
// Count number of orders uses.
3645
3682
const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & {
3646
- if (OpTE->State == TreeEntry::NeedToGather)
3647
- return GathersToOrders.find (OpTE)->second ;
3683
+ if (OpTE->State == TreeEntry::NeedToGather) {
3684
+ auto It = GathersToOrders.find (OpTE);
3685
+ if (It != GathersToOrders.end ())
3686
+ return It->second ;
3687
+ }
3648
3688
return OpTE->ReorderIndices ;
3649
3689
}();
3690
+ // First consider the order of the external scalar users.
3691
+ auto It = ExternalUserReorderMap.find (OpTE);
3692
+ if (It != ExternalUserReorderMap.end ()) {
3693
+ const auto &ExternalUserReorderIndices = It->second ;
3694
+ for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
3695
+ ++OrdersUses.insert (std::make_pair (ExtOrder, 0 )).first ->second ;
3696
+ // No other useful reorder data in this entry.
3697
+ if (Order.empty ())
3698
+ continue ;
3699
+ }
3650
3700
// Stores actually store the mask, not the order, need to invert.
3651
3701
if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle () &&
3652
3702
OpTE->getOpcode () == Instruction::Store && !Order.empty ()) {
@@ -4078,6 +4128,152 @@ void BoUpSLP::buildExternalUses(
4078
4128
}
4079
4129
}
4080
4130
4131
+ DenseMap<Value *, SmallVector<StoreInst *, 4 >>
4132
+ BoUpSLP::collectUserStores (const BoUpSLP::TreeEntry *TE) const {
4133
+ DenseMap<Value *, SmallVector<StoreInst *, 4 >> PtrToStoresMap;
4134
+ for (unsigned Lane : seq<unsigned >(0 , TE->Scalars .size ())) {
4135
+ Value *V = TE->Scalars [Lane];
4136
+ // To save compilation time we don't visit if we have too many users.
4137
+ static constexpr unsigned UsersLimit = 4 ;
4138
+ if (V->hasNUsesOrMore (UsersLimit))
4139
+ break ;
4140
+
4141
+ // Collect stores per pointer object.
4142
+ for (User *U : V->users ()) {
4143
+ auto *SI = dyn_cast<StoreInst>(U);
4144
+ if (SI == nullptr || !SI->isSimple () ||
4145
+ !isValidElementType (SI->getValueOperand ()->getType ()))
4146
+ continue ;
4147
+ // Skip entry if already
4148
+ if (getTreeEntry (U))
4149
+ continue ;
4150
+
4151
+ Value *Ptr = getUnderlyingObject (SI->getPointerOperand ());
4152
+ auto &StoresVec = PtrToStoresMap[Ptr];
4153
+ // For now just keep one store per pointer object per lane.
4154
+ // TODO: Extend this to support multiple stores per pointer per lane
4155
+ if (StoresVec.size () > Lane)
4156
+ continue ;
4157
+ // Skip if in different BBs.
4158
+ if (!StoresVec.empty () &&
4159
+ SI->getParent () != StoresVec.back ()->getParent ())
4160
+ continue ;
4161
+ // Make sure that the stores are of the same type.
4162
+ if (!StoresVec.empty () &&
4163
+ SI->getValueOperand ()->getType () !=
4164
+ StoresVec.back ()->getValueOperand ()->getType ())
4165
+ continue ;
4166
+ StoresVec.push_back (SI);
4167
+ }
4168
+ }
4169
+ return PtrToStoresMap;
4170
+ }
4171
+
4172
+ bool BoUpSLP::CanFormVector (const SmallVector<StoreInst *, 4 > &StoresVec,
4173
+ OrdersType &ReorderIndices) const {
4174
+ // We check whether the stores in StoreVec can form a vector by sorting them
4175
+ // and checking whether they are consecutive.
4176
+
4177
+ // To avoid calling getPointersDiff() while sorting we create a vector of
4178
+ // pairs {store, offset from first} and sort this instead.
4179
+ SmallVector<std::pair<StoreInst *, int >, 4 > StoreOffsetVec (StoresVec.size ());
4180
+ StoreInst *S0 = StoresVec[0 ];
4181
+ StoreOffsetVec[0 ] = {S0, 0 };
4182
+ Type *S0Ty = S0->getValueOperand ()->getType ();
4183
+ Value *S0Ptr = S0->getPointerOperand ();
4184
+ for (unsigned Idx : seq<unsigned >(1 , StoresVec.size ())) {
4185
+ StoreInst *SI = StoresVec[Idx];
4186
+ Optional<int > Diff =
4187
+ getPointersDiff (S0Ty, S0Ptr, SI->getValueOperand ()->getType (),
4188
+ SI->getPointerOperand (), *DL, *SE,
4189
+ /* StrictCheck=*/ true );
4190
+ // We failed to compare the pointers so just abandon this StoresVec.
4191
+ if (!Diff)
4192
+ return false ;
4193
+ StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
4194
+ }
4195
+
4196
+ // Sort the vector based on the pointers. We create a copy because we may
4197
+ // need the original later for calculating the reorder (shuffle) indices.
4198
+ stable_sort (StoreOffsetVec, [](const std::pair<StoreInst *, int > &Pair1,
4199
+ const std::pair<StoreInst *, int > &Pair2) {
4200
+ int Offset1 = Pair1.second ;
4201
+ int Offset2 = Pair2.second ;
4202
+ return Offset1 < Offset2;
4203
+ });
4204
+
4205
+ // Check if the stores are consecutive by checking if last-first == size-1.
4206
+ int LastOffset = StoreOffsetVec.back ().second ;
4207
+ int FirstOffset = StoreOffsetVec.front ().second ;
4208
+ if (LastOffset - FirstOffset != (int )StoreOffsetVec.size () - 1 )
4209
+ return false ;
4210
+
4211
+ // Calculate the shuffle indices according to their offset against the sorted
4212
+ // StoreOffsetVec.
4213
+ ReorderIndices.reserve (StoresVec.size ());
4214
+ for (StoreInst *SI : StoresVec) {
4215
+ unsigned Idx = find_if (StoreOffsetVec,
4216
+ [SI](const std::pair<StoreInst *, int > &Pair) {
4217
+ return Pair.first == SI;
4218
+ }) -
4219
+ StoreOffsetVec.begin ();
4220
+ ReorderIndices.push_back (Idx);
4221
+ }
4222
+ // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
4223
+ // reorderTopToBottom() and reorderBottomToTop(), so we are following the
4224
+ // same convention here.
4225
+ auto IsIdentityOrder = [](const OrdersType &Order) {
4226
+ for (unsigned Idx : seq<unsigned >(0 , Order.size ()))
4227
+ if (Idx != Order[Idx])
4228
+ return false ;
4229
+ return true ;
4230
+ };
4231
+ if (IsIdentityOrder (ReorderIndices))
4232
+ ReorderIndices.clear ();
4233
+
4234
+ return true ;
4235
+ }
4236
+
4237
+ #ifndef NDEBUG
4238
+ LLVM_DUMP_METHOD static void dumpOrder (const BoUpSLP::OrdersType &Order) {
4239
+ for (unsigned Idx : Order)
4240
+ dbgs () << Idx << " , " ;
4241
+ dbgs () << " \n " ;
4242
+ }
4243
+ #endif
4244
+
4245
+ SmallVector<BoUpSLP::OrdersType, 1 >
4246
+ BoUpSLP::findExternalStoreUsersReorderIndices (TreeEntry *TE) const {
4247
+ unsigned NumLanes = TE->Scalars .size ();
4248
+
4249
+ DenseMap<Value *, SmallVector<StoreInst *, 4 >> PtrToStoresMap =
4250
+ collectUserStores (TE);
4251
+
4252
+ // Holds the reorder indices for each candidate store vector that is a user of
4253
+ // the current TreeEntry.
4254
+ SmallVector<OrdersType, 1 > ExternalReorderIndices;
4255
+
4256
+ // Now inspect the stores collected per pointer and look for vectorization
4257
+ // candidates. For each candidate calculate the reorder index vector and push
4258
+ // it into `ExternalReorderIndices`
4259
+ for (const auto &Pair : PtrToStoresMap) {
4260
+ auto &StoresVec = Pair.second ;
4261
+ // If we have fewer than NumLanes stores, then we can't form a vector.
4262
+ if (StoresVec.size () != NumLanes)
4263
+ continue ;
4264
+
4265
+ // If the stores are not consecutive then abandon this StoresVec.
4266
+ OrdersType ReorderIndices;
4267
+ if (!CanFormVector (StoresVec, ReorderIndices))
4268
+ continue ;
4269
+
4270
+ // We now know that the scalars in StoresVec can form a vector instruction,
4271
+ // so set the reorder indices.
4272
+ ExternalReorderIndices.push_back (ReorderIndices);
4273
+ }
4274
+ return ExternalReorderIndices;
4275
+ }
4276
+
4081
4277
void BoUpSLP::buildTree (ArrayRef<Value *> Roots,
4082
4278
ArrayRef<Value *> UserIgnoreLst) {
4083
4279
deleteTree ();
0 commit comments