
Commit a65a5fe

[SLP]Improve masked loads vectorization, attempting gathered loads
If a vector of loads can only be vectorized as a masked gather and there are several other masked gather nodes, the compiler can try to check whether such nodes can be combined into a single wide consecutive/strided load node, which gives better performance.

Reviewers: RKSimon

Reviewed By: RKSimon

Pull Request: #110151
1 parent 36fc291 commit a65a5fe
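To illustrate the kind of access pattern this change targets, here is a minimal, hypothetical C++ sketch (not taken from the patch or its tests; the function name is invented). When SLP vectorizes the two stores, the operand vectors of the multiply node are the stride-2 load groups {a[0], a[2]} and {a[1], a[3]}; each group on its own would previously be emitted as a masked/strided gather, but together the four loads cover a[0..3] contiguously, so the vectorizer can now attempt one wide consecutive load plus shuffles, subject to the target's cost model.

void mul_pairs(int *b, const int *a) {
  // Lane 0 stores a[0] * a[1]; lane 1 stores a[2] * a[3]. The multiply node's
  // operands are the non-consecutive groups {a[0], a[2]} and {a[1], a[3]},
  // i.e. two would-be masked gathers that jointly form the consecutive run
  // a[0..3] (illustrative example only).
  b[0] = a[0] * a[1];
  b[1] = a[2] * a[3];
}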


6 files changed: +502, -411 lines changed


llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 117 additions & 23 deletions
@@ -1371,6 +1371,8 @@ class BoUpSLP {
     MustGather.clear();
     NonScheduledFirst.clear();
     EntryToLastInstruction.clear();
+    LoadEntriesToVectorize.clear();
+    IsGraphTransformMode = false;
     GatheredLoadsEntriesFirst.reset();
     ExternalUses.clear();
     ExternalUsesAsOriginalScalar.clear();
@@ -3613,6 +3615,14 @@ class BoUpSLP {
       DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
   ValueToGatherNodesMap ValueToGatherNodes;
 
+  /// A list of the load entries (node indices), which can be vectorized using
+  /// strided or masked gather approach, but attempted to be represented as
+  /// contiguous loads.
+  SetVector<unsigned> LoadEntriesToVectorize;
+
+  /// true if graph nodes transforming mode is on.
+  bool IsGraphTransformMode = false;
+
   /// The index of the first gathered load entry in the VectorizeTree.
   std::optional<unsigned> GatheredLoadsEntriesFirst;
 
@@ -4618,17 +4628,15 @@ static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
   if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
     return false;
   auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
-  if (!GEP1)
-    return false;
   auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
-  if (!GEP2)
-    return false;
-  return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
-         ((isConstant(GEP1->getOperand(1)) &&
-           isConstant(GEP2->getOperand(1))) ||
+  return (!GEP1 || GEP1->getNumOperands() == 2) &&
+         (!GEP2 || GEP2->getNumOperands() == 2) &&
+         (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
+           (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
          !CompareOpcodes ||
-          getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
-              .getOpcode());
+          (GEP1 && GEP2 &&
+           getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
+               .getOpcode()));
 }
 
 /// Calculates minimal alignment as a common alignment.
@@ -5118,9 +5126,9 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
       L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
        return L->isLoopInvariant(V);
      })) <= Sz / 2;
-  if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
+  if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
        auto *GEP = dyn_cast<GetElementPtrInst>(P);
-       return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
+       return (!GEP && doesNotNeedToBeScheduled(P)) ||
              (GEP && GEP->getNumOperands() == 2 &&
               isa<Constant, Instruction>(GEP->getOperand(1)));
      })) {
@@ -6667,6 +6675,12 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
     ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads) {
   GatheredLoadsEntriesFirst = VectorizableTree.size();
 
+  SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
+      LoadEntriesToVectorize.size());
+  for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
+    Set.insert(VectorizableTree[Idx]->Scalars.begin(),
+               VectorizableTree[Idx]->Scalars.end());
+
   // Sort loads by distance.
   auto LoadSorter = [](const std::pair<LoadInst *, int> &L1,
                        const std::pair<LoadInst *, int> &L2) {
@@ -6924,8 +6938,42 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
             }
           }
         }
+        // Cannot represent the loads as consecutive vectorizable nodes -
+        // just exit.
+        unsigned ConsecutiveNodesSize = 0;
+        if (!LoadEntriesToVectorize.empty() &&
+            any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
+                   [&, Slice = Slice](const auto &P) {
+                     const auto *It = find_if(Slice, [&](Value *V) {
+                       return std::get<1>(P).contains(V);
+                     });
+                     if (It == Slice.end())
+                       return false;
+                     ArrayRef<Value *> VL =
+                         VectorizableTree[std::get<0>(P)]->Scalars;
+                     ConsecutiveNodesSize += VL.size();
+                     unsigned Start = std::distance(Slice.begin(), It);
+                     unsigned Sz = Slice.size() - Start;
+                     return Sz < VL.size() ||
+                            Slice.slice(std::distance(Slice.begin(), It),
+                                        VL.size()) != VL;
+                   }))
+          continue;
         // Try to build long masked gather loads.
         UserMaxVF = bit_ceil(UserMaxVF);
+        if (any_of(seq<unsigned>(Slice.size() / UserMaxVF),
+                   [&, Slice = Slice](unsigned Idx) {
+                     OrdersType Order;
+                     SmallVector<Value *> PointerOps;
+                     return canVectorizeLoads(
+                                Slice.slice(Idx * UserMaxVF, UserMaxVF),
+                                Slice[Idx * UserMaxVF], Order,
+                                PointerOps) ==
+                            LoadsState::ScatterVectorize;
+                   }))
+          UserMaxVF = MaxVF;
+        if (Slice.size() != ConsecutiveNodesSize)
+          MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
       }
       for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
         bool IsVectorized = true;
@@ -6934,6 +6982,16 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
               Slice.slice(I, std::min(VF, E - I));
           if (getTreeEntry(SubSlice.front()))
             continue;
+          // Check if the subslice is to be-vectorized entry, which is not
+          // equal to entry.
+          if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
+                     [&](const auto &P) {
+                       return !SubSlice.equals(
+                                  VectorizableTree[std::get<0>(P)]
+                                      ->Scalars) &&
+                              set_is_subset(SubSlice, std::get<1>(P));
+                     }))
+            continue;
           unsigned Sz = VectorizableTree.size();
           buildTree_rec(SubSlice, 0, EdgeInfo());
           if (Sz == VectorizableTree.size()) {
@@ -6968,6 +7026,20 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
     // Final attempt to vectorize non-vectorized loads.
     (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
   }
+  // Try to vectorize postponed load entries, previously marked as gathered.
+  for (unsigned Idx : LoadEntriesToVectorize) {
+    const TreeEntry &E = *VectorizableTree[Idx];
+    SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
+    // Avoid reordering, if possible.
+    if (!E.ReorderIndices.empty()) {
+      // Build a mask out of the reorder indices and reorder scalars per this
+      // mask.
+      SmallVector<int> ReorderMask;
+      inversePermutation(E.ReorderIndices, ReorderMask);
+      reorderScalars(GatheredScalars, ReorderMask);
+    }
+    buildTree_rec(GatheredScalars, 0, EdgeInfo());
+  }
   // If no new entries created, consider it as no gathered loads entries must be
   // handled.
   if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
@@ -7280,6 +7352,11 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
     case LoadsState::Vectorize:
       return TreeEntry::Vectorize;
     case LoadsState::ScatterVectorize:
+      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
+        // Delay slow vectorized nodes for better vectorization attempts.
+        LoadEntriesToVectorize.insert(VectorizableTree.size());
+        return TreeEntry::NeedToGather;
+      }
      return TreeEntry::ScatterVectorize;
     case LoadsState::StridedVectorize:
       return TreeEntry::StridedVectorize;
@@ -9117,6 +9194,17 @@ getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
 void BoUpSLP::transformNodes() {
   constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   BaseGraphSize = VectorizableTree.size();
+  // Turn graph transforming mode on and off, when done.
+  class GraphTransformModeRAAI {
+    bool &SavedIsGraphTransformMode;
+
+  public:
+    GraphTransformModeRAAI(bool &IsGraphTransformMode)
+        : SavedIsGraphTransformMode(IsGraphTransformMode) {
+      IsGraphTransformMode = true;
+    }
+    ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
+  } TransformContext(IsGraphTransformMode);
   // Operands are profitable if they are:
   // 1. At least one constant
   // or
@@ -9149,7 +9237,7 @@ void BoUpSLP::transformNodes() {
     unsigned MinVF = getMinVF(2 * Sz);
     // Do not try partial vectorization for small nodes (<= 2), nodes with the
     // same opcode and same parent block or all constants.
-    if (VL.size() <= 2 ||
+    if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
         !(!E.getOpcode() || E.getOpcode() == Instruction::Load ||
           E.isAltShuffle() || !allSameBlock(VL)) ||
         allConstant(VL) || isSplat(VL))
@@ -9248,13 +9336,17 @@ void BoUpSLP::transformNodes() {
           continue;
         }
         unsigned PrevSize = VectorizableTree.size();
+        [[maybe_unused]] unsigned PrevEntriesSize =
+            LoadEntriesToVectorize.size();
         buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX));
         if (PrevSize + 1 == VectorizableTree.size() &&
             VectorizableTree[PrevSize]->isGather() &&
             VectorizableTree[PrevSize]->getOpcode() !=
                 Instruction::ExtractElement &&
             !isSplat(Slice)) {
           VectorizableTree.pop_back();
+          assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
+                 "LoadEntriesToVectorize expected to remain the same");
           continue;
         }
         AddCombinedNode(PrevSize, Cnt);
@@ -9340,17 +9432,19 @@ void BoUpSLP::transformNodes() {
     }
   }
 
-  // Single load node - exit.
-  if (VectorizableTree.size() <= 1 &&
-      VectorizableTree.front()->getOpcode() == Instruction::Load)
-    return;
-  // Small graph with small VF - exit.
-  constexpr unsigned SmallTree = 3;
-  constexpr unsigned SmallVF = 2;
-  if ((VectorizableTree.size() <= SmallTree &&
-       VectorizableTree.front()->Scalars.size() == SmallVF) ||
-      (VectorizableTree.size() <= 2 && UserIgnoreList))
-    return;
+  if (LoadEntriesToVectorize.empty()) {
+    // Single load node - exit.
+    if (VectorizableTree.size() <= 1 &&
+        VectorizableTree.front()->getOpcode() == Instruction::Load)
+      return;
+    // Small graph with small VF - exit.
+    constexpr unsigned SmallTree = 3;
+    constexpr unsigned SmallVF = 2;
+    if ((VectorizableTree.size() <= SmallTree &&
+         VectorizableTree.front()->Scalars.size() == SmallVF) ||
+        (VectorizableTree.size() <= 2 && UserIgnoreList))
+      return;
+  }
 
   // A list of loads to be gathered during the vectorization process. We can
   // try to vectorize them at the end, if profitable.
