@@ -1371,6 +1371,8 @@ class BoUpSLP {
     MustGather.clear();
     NonScheduledFirst.clear();
     EntryToLastInstruction.clear();
+    LoadEntriesToVectorize.clear();
+    IsGraphTransformMode = false;
     GatheredLoadsEntriesFirst.reset();
     ExternalUses.clear();
     ExternalUsesAsOriginalScalar.clear();
@@ -3613,6 +3615,14 @@ class BoUpSLP {
       DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
   ValueToGatherNodesMap ValueToGatherNodes;
 
+  /// A list of load entries (node indices) that can be vectorized via a
+  /// strided or masked gather approach, but that we first attempt to
+  /// represent as contiguous loads.
+  SetVector<unsigned> LoadEntriesToVectorize;
+
+  /// True if the graph-nodes-transforming mode is on.
+  bool IsGraphTransformMode = false;
+
   /// The index of the first gathered load entry in the VectorizeTree.
   std::optional<unsigned> GatheredLoadsEntriesFirst;
 
@@ -4618,17 +4628,15 @@ static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
   if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
     return false;
   auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
-  if (!GEP1)
-    return false;
   auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
-  if (!GEP2)
-    return false;
-  return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
-         ((isConstant(GEP1->getOperand(1)) &&
-           isConstant(GEP2->getOperand(1))) ||
+  return (!GEP1 || GEP1->getNumOperands() == 2) &&
+         (!GEP2 || GEP2->getNumOperands() == 2) &&
+         (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
+           (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
           !CompareOpcodes ||
-          getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
-              .getOpcode());
+          (GEP1 && GEP2 &&
+           getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
+               .getOpcode()));
 }
 
 /// Calculates minimal alignment as a common alignment.
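
This hunk relaxes arePointersCompatible: previously both pointers had to be simple GEPs, while now a bare pointer (no GEP at all) vacuously satisfies each per-GEP constraint, and the opcode comparison is attempted only when both GEPs actually exist. A minimal sketch of that boolean structure, using a hypothetical SimpleGEP stand-in rather than the real IR types:

```cpp
// Sketch only: mirrors the boolean shape of the relaxed check. SimpleGEP
// is a hypothetical stand-in for a GetElementPtrInst with one index.
struct SimpleGEP {
  int NumOperands;
  bool ConstantIndex;
};

// A missing GEP (nullptr) passes every per-GEP constraint; the opcode
// comparison applies only when both GEPs are present.
static bool compatible(const SimpleGEP *G1, const SimpleGEP *G2,
                       bool CompareOpcodes, bool SameOpcode) {
  return (!G1 || G1->NumOperands == 2) && (!G2 || G2->NumOperands == 2) &&
         (((!G1 || G1->ConstantIndex) && (!G2 || G2->ConstantIndex)) ||
          !CompareOpcodes || (G1 && G2 && SameOpcode));
}

int main() {
  SimpleGEP G{2, true};
  // One bare pointer, one simple GEP with a constant index: compatible.
  return compatible(nullptr, &G, /*CompareOpcodes=*/true,
                    /*SameOpcode=*/false) ? 0 : 1;
}
```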
@@ -5118,9 +5126,9 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
       L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
         return L->isLoopInvariant(V);
       })) <= Sz / 2;
-  if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
+  if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
         auto *GEP = dyn_cast<GetElementPtrInst>(P);
-        return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
+        return (!GEP && doesNotNeedToBeScheduled(P)) ||
                (GEP && GEP->getNumOperands() == 2 &&
                 isa<Constant, Instruction>(GEP->getOperand(1)));
       })) {
@@ -6667,6 +6675,12 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
     ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads) {
   GatheredLoadsEntriesFirst = VectorizableTree.size();
 
+  SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
+      LoadEntriesToVectorize.size());
+  for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
+    Set.insert(VectorizableTree[Idx]->Scalars.begin(),
+               VectorizableTree[Idx]->Scalars.end());
+
   // Sort loads by distance.
   auto LoadSorter = [](const std::pair<LoadInst *, int> &L1,
                        const std::pair<LoadInst *, int> &L2) {
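
The new preamble precomputes one scalar set per postponed load entry so that the later slice scans can test membership in O(1) instead of rescanning each entry's scalar list. A standard-library sketch of the same precomputation, with hypothetical integer "scalars" in place of Value pointers:

```cpp
#include <cstdio>
#include <unordered_set>
#include <vector>

// Sketch of the precomputation pattern (hypothetical data, standard
// containers instead of SetVector/SmallPtrSet): build one hash set per
// postponed entry up front for cheap membership queries later.
int main() {
  std::vector<std::vector<int>> EntryScalars = {{1, 2, 3, 4}, {7, 8}};
  std::vector<std::unordered_set<int>> Sets(EntryScalars.size());
  for (size_t I = 0; I < EntryScalars.size(); ++I)
    Sets[I].insert(EntryScalars[I].begin(), EntryScalars[I].end());
  std::printf("%d\n", Sets[0].count(3) ? 1 : 0); // prints 1
}
```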
@@ -6924,8 +6938,42 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
             }
           }
         }
+        // Cannot represent the loads as consecutive vectorizable nodes -
+        // skip this slice.
+        unsigned ConsecutiveNodesSize = 0;
+        if (!LoadEntriesToVectorize.empty() &&
+            any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
+                   [&, Slice = Slice](const auto &P) {
+                     const auto *It = find_if(Slice, [&](Value *V) {
+                       return std::get<1>(P).contains(V);
+                     });
+                     if (It == Slice.end())
+                       return false;
+                     ArrayRef<Value *> VL =
+                         VectorizableTree[std::get<0>(P)]->Scalars;
+                     ConsecutiveNodesSize += VL.size();
+                     unsigned Start = std::distance(Slice.begin(), It);
+                     unsigned Sz = Slice.size() - Start;
+                     return Sz < VL.size() ||
+                            Slice.slice(std::distance(Slice.begin(), It),
+                                        VL.size()) != VL;
+                   }))
+          continue;
         // Try to build long masked gather loads.
         UserMaxVF = bit_ceil(UserMaxVF);
+        if (any_of(seq<unsigned>(Slice.size() / UserMaxVF),
+                   [&, Slice = Slice](unsigned Idx) {
+                     OrdersType Order;
+                     SmallVector<Value *> PointerOps;
+                     return canVectorizeLoads(
+                                Slice.slice(Idx * UserMaxVF, UserMaxVF),
+                                Slice[Idx * UserMaxVF], Order,
+                                PointerOps) ==
+                            LoadsState::ScatterVectorize;
+                   }))
+          UserMaxVF = MaxVF;
+        if (Slice.size() != ConsecutiveNodesSize)
+          MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
       }
       for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
         bool IsVectorized = true;
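
Two things happen at the end of this hunk: the requested VF is rounded up to a power of two with bit_ceil, and if any rounded chunk would still end up scatter-vectorized, the code falls back to MaxVF; the subsequent loop then halves VF down to 2. A small self-contained sketch of that search order, using hypothetical sizes:

```cpp
#include <bit>
#include <cstdio>

// Sketch of the VF search order (hypothetical sizes): round the requested
// VF up to a power of two, then try candidate factors halving down to 2.
int main() {
  unsigned UserMaxVF = std::bit_ceil(6u); // 6 -> 8
  unsigned MaxVF = 16;
  // In the real code, chunks that would still be scatter-vectorized at the
  // rounded VF force a fallback to MaxVF; here we only show the halving.
  for (unsigned VF = MaxVF; VF >= 2; VF /= 2)
    std::printf("try VF=%u\n", VF); // 16, 8, 4, 2
  (void)UserMaxVF;
}
```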
@@ -6934,6 +6982,16 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
               Slice.slice(I, std::min(VF, E - I));
           if (getTreeEntry(SubSlice.front()))
             continue;
+          // Skip the subslice if it belongs to a to-be-vectorized entry
+          // without matching that entry exactly.
+          if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
+                     [&](const auto &P) {
+                       return !SubSlice.equals(
+                                  VectorizableTree[std::get<0>(P)]
+                                      ->Scalars) &&
+                              set_is_subset(SubSlice, std::get<1>(P));
+                     }))
+            continue;
           unsigned Sz = VectorizableTree.size();
           buildTree_rec(SubSlice, 0, EdgeInfo());
           if (Sz == VectorizableTree.size()) {
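
The inserted guard skips a subslice whose loads all belong to some postponed entry but which does not match that entry's scalar sequence exactly, since vectorizing such a strict subset would conflict with the postponed node. A self-contained sketch of the test, with hypothetical integer values:

```cpp
#include <algorithm>
#include <cstdio>
#include <unordered_set>
#include <vector>

// Sketch of the "strict subset" skip (hypothetical values): reject a chunk
// when every element lies in a reserved set but the chunk does not match
// the reserved sequence exactly.
static bool isStrictSubsetOf(const std::vector<int> &Chunk,
                             const std::vector<int> &Reserved,
                             const std::unordered_set<int> &ReservedSet) {
  return Chunk != Reserved &&
         std::all_of(Chunk.begin(), Chunk.end(),
                     [&](int V) { return ReservedSet.count(V) != 0; });
}

int main() {
  std::vector<int> Reserved = {1, 2, 3, 4};
  std::unordered_set<int> Set(Reserved.begin(), Reserved.end());
  std::printf("%d\n", isStrictSubsetOf({2, 3}, Reserved, Set));       // 1: skip
  std::printf("%d\n", isStrictSubsetOf({1, 2, 3, 4}, Reserved, Set)); // 0: keep
}
```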
@@ -6968,6 +7026,20 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
     // Final attempt to vectorize non-vectorized loads.
     (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
   }
+  // Try to vectorize postponed load entries, previously marked as gathered.
+  for (unsigned Idx : LoadEntriesToVectorize) {
+    const TreeEntry &E = *VectorizableTree[Idx];
+    SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
+    // Avoid reordering, if possible.
+    if (!E.ReorderIndices.empty()) {
+      // Build a mask out of the reorder indices and reorder the scalars
+      // according to this mask.
+      SmallVector<int> ReorderMask;
+      inversePermutation(E.ReorderIndices, ReorderMask);
+      reorderScalars(GatheredScalars, ReorderMask);
+    }
+    buildTree_rec(GatheredScalars, 0, EdgeInfo());
+  }
   // If no new entries were created, treat it as if no gathered-loads entries
   // needed to be handled.
   if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
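
The postponed entries are re-fed to buildTree_rec in their original scalar order where possible: the recorded ReorderIndices are inverted into a mask, and the scalars are shuffled back through it. A worked sketch, assuming the usual LLVM semantics where inversePermutation sets Mask[Indices[I]] = I and reordering then moves element I to position Mask[I]:

```cpp
#include <cstdio>
#include <vector>

// Worked sketch of the mask construction, assuming inversePermutation
// sets Mask[Indices[I]] = I and reordering moves element I to position
// Mask[I], thereby undoing the recorded order.
int main() {
  std::vector<unsigned> ReorderIndices = {2, 0, 1};
  std::vector<int> Mask(ReorderIndices.size());
  for (unsigned I = 0; I < ReorderIndices.size(); ++I)
    Mask[ReorderIndices[I]] = I; // Mask = {1, 2, 0}
  std::vector<char> Scalars = {'a', 'b', 'c'}, Out(Scalars.size());
  for (unsigned I = 0; I < Scalars.size(); ++I)
    Out[Mask[I]] = Scalars[I]; // Out = {c, a, b}
  std::printf("%c%c%c\n", Out[0], Out[1], Out[2]); // prints "cab"
}
```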
@@ -7280,6 +7352,11 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
   case LoadsState::Vectorize:
     return TreeEntry::Vectorize;
   case LoadsState::ScatterVectorize:
+    if (!IsGraphTransformMode && !VectorizableTree.empty()) {
+      // Delay slow vectorized nodes for better vectorization attempts.
+      LoadEntriesToVectorize.insert(VectorizableTree.size());
+      return TreeEntry::NeedToGather;
+    }
     return TreeEntry::ScatterVectorize;
   case LoadsState::StridedVectorize:
     return TreeEntry::StridedVectorize;
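
This is the heart of the change: outside the graph-transform mode, a would-be ScatterVectorize node is downgraded to NeedToGather and its index is recorded in LoadEntriesToVectorize for a later, better-informed attempt. A sketch of that defer-and-revisit pattern, using a hypothetical node model rather than the real tree entries:

```cpp
#include <cstdio>
#include <set>
#include <vector>

// Sketch of the postponement pattern (hypothetical node model): instead of
// committing to an expensive "scatter" node immediately, record its index
// and revisit it after cheaper alternatives have been tried.
int main() {
  std::vector<const char *> Nodes; // stands in for VectorizableTree
  std::set<unsigned> Postponed;    // stands in for LoadEntriesToVectorize
  auto classify = [&](bool Expensive) {
    if (Expensive) {
      Postponed.insert(Nodes.size()); // remember the would-be index
      Nodes.push_back("gather");      // emit a cheap gather for now
      return;
    }
    Nodes.push_back("vectorize");
  };
  classify(false);
  classify(true);
  for (unsigned Idx : Postponed)
    std::printf("revisit node %u (%s)\n", Idx, Nodes[Idx]); // revisit node 1
}
```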
@@ -9117,6 +9194,17 @@ getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
 void BoUpSLP::transformNodes() {
   constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   BaseGraphSize = VectorizableTree.size();
+  // Turn the graph-transforming mode on, and turn it off again when done.
+  class GraphTransformModeRAAI {
+    bool &SavedIsGraphTransformMode;
+
+  public:
+    GraphTransformModeRAAI(bool &IsGraphTransformMode)
+        : SavedIsGraphTransformMode(IsGraphTransformMode) {
+      IsGraphTransformMode = true;
+    }
+    ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
+  } TransformContext(IsGraphTransformMode);
   // Operands are profitable if they are:
   // 1. At least one constant
   // or
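
The local GraphTransformModeRAAI class (the spelling is as committed) is a plain RAII flag guard: the flag is raised for the duration of transformNodes() and dropped on every exit path, including early returns. A generic sketch of the idiom:

```cpp
#include <cassert>

// Generic sketch of the same RAII idiom: set a flag for the lifetime of a
// scope and restore it on exit, even on early return or exception.
class FlagGuard {
  bool &Flag;

public:
  explicit FlagGuard(bool &F) : Flag(F) { Flag = true; }
  ~FlagGuard() { Flag = false; }
};

int main() {
  bool Mode = false;
  {
    FlagGuard G(Mode);
    assert(Mode); // on inside the scope
  }
  assert(!Mode); // off again afterwards
}
```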
@@ -9149,7 +9237,7 @@ void BoUpSLP::transformNodes() {
     unsigned MinVF = getMinVF(2 * Sz);
     // Do not try partial vectorization for small nodes (<= 2), nodes with the
     // same opcode and same parent block or all constants.
-    if (VL.size() <= 2 ||
+    if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
         !(!E.getOpcode() || E.getOpcode() == Instruction::Load ||
           E.isAltShuffle() || !allSameBlock(VL)) ||
         allConstant(VL) || isSplat(VL))
@@ -9248,13 +9336,17 @@ void BoUpSLP::transformNodes() {
         continue;
       }
       unsigned PrevSize = VectorizableTree.size();
+      [[maybe_unused]] unsigned PrevEntriesSize =
+          LoadEntriesToVectorize.size();
       buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX));
       if (PrevSize + 1 == VectorizableTree.size() &&
           VectorizableTree[PrevSize]->isGather() &&
           VectorizableTree[PrevSize]->getOpcode() !=
               Instruction::ExtractElement &&
           !isSplat(Slice)) {
         VectorizableTree.pop_back();
+        assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
+               "LoadEntriesToVectorize expected to remain the same");
         continue;
       }
       AddCombinedNode(PrevSize, Cnt);
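
PrevEntriesSize is marked [[maybe_unused]] because it is referenced only by the assert, which compiles away under -DNDEBUG and would otherwise trigger an unused-variable warning in release builds. A tiny sketch of that idiom:

```cpp
#include <cassert>

// Sketch of the [[maybe_unused]] + assert idiom: the variable exists only
// to feed the assertion, so release builds (-DNDEBUG) would otherwise warn
// about it being unused.
int main() {
  [[maybe_unused]] int Before = 42;
  int After = 42;
  assert(Before == After && "value expected to remain the same");
  return After - 42; // returns 0
}
```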
@@ -9340,17 +9432,19 @@ void BoUpSLP::transformNodes() {
     }
   }
 
-  // Single load node - exit.
-  if (VectorizableTree.size() <= 1 &&
-      VectorizableTree.front()->getOpcode() == Instruction::Load)
-    return;
-  // Small graph with small VF - exit.
-  constexpr unsigned SmallTree = 3;
-  constexpr unsigned SmallVF = 2;
-  if ((VectorizableTree.size() <= SmallTree &&
-       VectorizableTree.front()->Scalars.size() == SmallVF) ||
-      (VectorizableTree.size() <= 2 && UserIgnoreList))
-    return;
+  if (LoadEntriesToVectorize.empty()) {
+    // Single load node - exit.
+    if (VectorizableTree.size() <= 1 &&
+        VectorizableTree.front()->getOpcode() == Instruction::Load)
+      return;
+    // Small graph with small VF - exit.
+    constexpr unsigned SmallTree = 3;
+    constexpr unsigned SmallVF = 2;
+    if ((VectorizableTree.size() <= SmallTree &&
+         VectorizableTree.front()->Scalars.size() == SmallVF) ||
+        (VectorizableTree.size() <= 2 && UserIgnoreList))
+      return;
+  }
 
   // A list of loads to be gathered during the vectorization process. We can
   // try to vectorize them at the end, if profitable.