@@ -2922,7 +2922,7 @@ class BoUpSLP {
2922
2922
2923
2923
/// This is the recursive part of buildTree.
2924
2924
void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
2925
- const EdgeInfo &EI);
2925
+ const EdgeInfo &EI, unsigned InterleaveFactor = 0 );
2926
2926
2927
2927
/// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
2928
2928
/// be vectorized to use the original vector (or aggregate "bitcast" to a
@@ -3226,7 +3226,15 @@ class BoUpSLP {
3226
3226
Instruction *MainOp = nullptr;
3227
3227
Instruction *AltOp = nullptr;
3228
3228
3229
+ /// Interleave factor of an interleaved-load Vectorize node; 0 if the node is not interleaved.
3230
+ unsigned InterleaveFactor = 0;
3231
+
3229
3232
public:
3233
+ /// Returns the interleave factor of this entry; 0 (the default) means no factor has been set.
3234
+ unsigned getInterleaveFactor() const { return InterleaveFactor; }
3235
+ /// Sets the interleave factor of this entry to \p Factor.
3236
+ void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
3237
+
3230
3238
/// Set this bundle's \p OpIdx'th operand to \p OpVL.
3231
3239
void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
3232
3240
if (Operands.size() < OpIdx + 1)
@@ -3390,7 +3398,12 @@ class BoUpSLP {
3390
3398
dbgs() << "State: ";
3391
3399
switch (State) {
3392
3400
case Vectorize:
3393
- dbgs() << "Vectorize\n";
3401
+ if (InterleaveFactor > 0) {
3402
+ dbgs() << "Vectorize with interleave factor " << InterleaveFactor
3403
+ << "\n";
3404
+ } else {
3405
+ dbgs() << "Vectorize\n";
3406
+ }
3394
3407
break;
3395
3408
case ScatterVectorize:
3396
3409
dbgs() << "ScatterVectorize\n";
@@ -3460,11 +3473,15 @@ class BoUpSLP {
3460
3473
const InstructionsState &S,
3461
3474
const EdgeInfo &UserTreeIdx,
3462
3475
ArrayRef<int> ReuseShuffleIndices = {},
3463
- ArrayRef<unsigned> ReorderIndices = {}) {
3476
+ ArrayRef<unsigned> ReorderIndices = {},
3477
+ unsigned InterleaveFactor = 0) {
3464
3478
TreeEntry::EntryState EntryState =
3465
3479
Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3466
- return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3467
- ReuseShuffleIndices, ReorderIndices);
3480
+ TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3481
+ ReuseShuffleIndices, ReorderIndices);
3482
+ if (E && InterleaveFactor > 0)
3483
+ E->setInterleave(InterleaveFactor);
3484
+ return E;
3468
3485
}
3469
3486
3470
3487
TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
@@ -6849,7 +6866,8 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
6849
6866
return Results;
6850
6867
};
6851
6868
auto ProcessGatheredLoads =
6852
- [&](ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads,
6869
+ [&, &TTI = *TTI](
6870
+ ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads,
6853
6871
bool Final = false) {
6854
6872
SmallVector<LoadInst *> NonVectorized;
6855
6873
for (ArrayRef<std::pair<LoadInst *, int>> LoadsDists : GatheredLoads) {
@@ -6932,11 +6950,16 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
6932
6950
// distance between scalar loads in these nodes.
6933
6951
unsigned MaxVF = Slice.size();
6934
6952
unsigned UserMaxVF = 0;
6953
+ unsigned InterleaveFactor = 0;
6935
6954
if (MaxVF == 2) {
6936
6955
UserMaxVF = MaxVF;
6937
6956
} else {
6957
+ // Found distance between segments of the interleaved loads.
6958
+ std::optional<unsigned> InterleavedLoadsDistance = 0;
6959
+ unsigned Order = 0;
6938
6960
std::optional<unsigned> CommonVF = 0;
6939
6961
DenseMap<const TreeEntry *, unsigned> EntryToPosition;
6962
+ SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
6940
6963
for (auto [Idx, V] : enumerate(Slice)) {
6941
6964
for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
6942
6965
UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
@@ -6951,12 +6974,59 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
6951
6974
if (*CommonVF != E->Scalars.size())
6952
6975
CommonVF.reset();
6953
6976
}
6977
+ // Check if the load is part of an interleaved load.
6978
+ if (Pos != Idx && InterleavedLoadsDistance) {
6979
+ if (!DeinterleavedNodes.contains(E) &&
6980
+ any_of(E->Scalars, [&, Slice = Slice](Value *V) {
6981
+ if (isa<Constant>(V))
6982
+ return false;
6983
+ if (getTreeEntry(V))
6984
+ return true;
6985
+ const auto &Nodes = ValueToGatherNodes.at(V);
6986
+ return (Nodes.size() != 1 || !Nodes.contains(E)) &&
6987
+ !is_contained(Slice, V);
6988
+ })) {
6989
+ InterleavedLoadsDistance.reset();
6990
+ continue;
6991
+ }
6992
+ DeinterleavedNodes.insert(E);
6993
+ if (*InterleavedLoadsDistance == 0) {
6994
+ InterleavedLoadsDistance = Idx - Pos;
6995
+ continue;
6996
+ }
6997
+ if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
6998
+ (Idx - Pos) / *InterleavedLoadsDistance < Order)
6999
+ InterleavedLoadsDistance.reset();
7000
+ Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
7001
+ }
7002
+ }
7003
+ }
7004
+ DeinterleavedNodes.clear();
7005
+ // Check if the large load represents an interleaved load operation.
7006
+ if (InterleavedLoadsDistance.value_or(0) > 1 &&
7007
+ CommonVF.value_or(0) != 0) {
7008
+ InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
7009
+ unsigned VF = *CommonVF;
7010
+ OrdersType Order;
7011
+ SmallVector<Value *> PointerOps;
7012
+ // Segmented load detected - vectorize at maximum vector factor.
7013
+ if (TTI.isLegalInterleavedAccessType(
7014
+ getWidenedType(Slice.front()->getType(), VF),
7015
+ InterleaveFactor,
7016
+ cast<LoadInst>(Slice.front())->getAlign(),
7017
+ cast<LoadInst>(Slice.front())
7018
+ ->getPointerAddressSpace()) &&
7019
+ canVectorizeLoads(Slice, Slice.front(), Order,
7020
+ PointerOps) == LoadsState::Vectorize) {
7021
+ UserMaxVF = InterleaveFactor * VF;
7022
+ } else {
7023
+ InterleaveFactor = 0;
6954
7024
}
6955
7025
}
6956
7026
// Cannot represent the loads as consecutive vectorizable nodes -
6957
7027
// just exit.
6958
7028
unsigned ConsecutiveNodesSize = 0;
6959
- if (!LoadEntriesToVectorize.empty() &&
7029
+ if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
6960
7030
any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
6961
7031
[&, Slice = Slice](const auto &P) {
6962
7032
const auto *It = find_if(Slice, [&](Value *V) {
@@ -6976,7 +7046,8 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
6976
7046
continue;
6977
7047
// Try to build long masked gather loads.
6978
7048
UserMaxVF = bit_ceil(UserMaxVF);
6979
- if (any_of(seq<unsigned>(Slice.size() / UserMaxVF),
7049
+ if (InterleaveFactor == 0 &&
7050
+ any_of(seq<unsigned>(Slice.size() / UserMaxVF),
6980
7051
[&, Slice = Slice](unsigned Idx) {
6981
7052
OrdersType Order;
6982
7053
SmallVector<Value *> PointerOps;
@@ -7008,9 +7079,15 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
7008
7079
}))
7009
7080
continue;
7010
7081
unsigned Sz = VectorizableTree.size();
7011
- buildTree_rec(SubSlice, 0, EdgeInfo());
7082
+ buildTree_rec(SubSlice, 0, EdgeInfo(), InterleaveFactor );
7012
7083
if (Sz == VectorizableTree.size()) {
7013
7084
IsVectorized = false;
7085
+ // Try non-interleaved vectorization with smaller vector
7086
+ // factor.
7087
+ if (InterleaveFactor > 0) {
7088
+ VF = 2 * (MaxVF / InterleaveFactor);
7089
+ InterleaveFactor = 0;
7090
+ }
7014
7091
continue;
7015
7092
}
7016
7093
}
@@ -7374,6 +7451,11 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
7374
7451
}
7375
7452
return TreeEntry::ScatterVectorize;
7376
7453
case LoadsState::StridedVectorize:
7454
+ if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
7455
+ // Delay slow vectorized nodes for better vectorization attempts.
7456
+ LoadEntriesToVectorize.insert(VectorizableTree.size());
7457
+ return TreeEntry::NeedToGather;
7458
+ }
7377
7459
return TreeEntry::StridedVectorize;
7378
7460
case LoadsState::Gather:
7379
7461
#ifndef NDEBUG
@@ -7707,7 +7789,8 @@ class PHIHandler {
7707
7789
} // namespace
7708
7790
7709
7791
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
7710
- const EdgeInfo &UserTreeIdx) {
7792
+ const EdgeInfo &UserTreeIdx,
7793
+ unsigned InterleaveFactor) {
7711
7794
assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
7712
7795
7713
7796
SmallVector<int> ReuseShuffleIndices;
@@ -8185,7 +8268,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
8185
8268
switch (State) {
8186
8269
case TreeEntry::Vectorize:
8187
8270
TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8188
- ReuseShuffleIndices, CurrentOrder);
8271
+ ReuseShuffleIndices, CurrentOrder, InterleaveFactor );
8189
8272
if (CurrentOrder.empty())
8190
8273
LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
8191
8274
else
@@ -9895,6 +9978,12 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
9895
9978
Idx = EMask[Idx];
9896
9979
}
9897
9980
CommonVF = E->Scalars.size();
9981
+ } else if (std::optional<unsigned> Factor = E->getInterleaveFactor();
9982
+ Factor && E->Scalars.size() != Mask.size() &&
9983
+ ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
9984
+ *Factor)) {
9985
+ // Deinterleaved nodes are free.
9986
+ std::iota(CommonMask.begin(), CommonMask.end(), 0);
9898
9987
}
9899
9988
ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
9900
9989
V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
@@ -10968,23 +11057,38 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
10968
11057
auto *LI0 = cast<LoadInst>(VL0);
10969
11058
auto GetVectorCost = [&](InstructionCost CommonCost) {
10970
11059
InstructionCost VecLdCost;
10971
- if (E->State == TreeEntry::Vectorize) {
10972
- VecLdCost = TTI->getMemoryOpCost(
10973
- Instruction::Load, VecTy, LI0->getAlign(),
10974
- LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
10975
- } else if (E->State == TreeEntry::StridedVectorize) {
11060
+ switch (E->State) {
11061
+ case TreeEntry::Vectorize:
11062
+ if (unsigned Factor = E->getInterleaveFactor()) {
11063
+ VecLdCost = TTI->getInterleavedMemoryOpCost(
11064
+ Instruction::Load, VecTy, Factor, std::nullopt, LI0->getAlign(),
11065
+ LI0->getPointerAddressSpace(), CostKind);
11066
+
11067
+ } else {
11068
+ VecLdCost = TTI->getMemoryOpCost(
11069
+ Instruction::Load, VecTy, LI0->getAlign(),
11070
+ LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
11071
+ }
11072
+ break;
11073
+ case TreeEntry::StridedVectorize: {
10976
11074
Align CommonAlignment =
10977
11075
computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
10978
11076
VecLdCost = TTI->getStridedMemoryOpCost(
10979
11077
Instruction::Load, VecTy, LI0->getPointerOperand(),
10980
11078
/*VariableMask=*/false, CommonAlignment, CostKind);
10981
- } else {
10982
- assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
11079
+ break;
11080
+ }
11081
+ case TreeEntry::ScatterVectorize: {
10983
11082
Align CommonAlignment =
10984
11083
computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
10985
11084
VecLdCost = TTI->getGatherScatterOpCost(
10986
11085
Instruction::Load, VecTy, LI0->getPointerOperand(),
10987
11086
/*VariableMask=*/false, CommonAlignment, CostKind);
11087
+ break;
11088
+ }
11089
+ case TreeEntry::CombinedVectorize:
11090
+ case TreeEntry::NeedToGather:
11091
+ llvm_unreachable("Unexpected vectorization state.");
10988
11092
}
10989
11093
return VecLdCost + CommonCost;
10990
11094
};
@@ -11397,6 +11501,11 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
11397
11501
}))
11398
11502
return false;
11399
11503
11504
+ if (VectorizableTree.back()->isGather() &&
11505
+ VectorizableTree.back()->isAltShuffle() &&
11506
+ VectorizableTree.back()->getVectorFactor() > 2)
11507
+ return false;
11508
+
11400
11509
assert(VectorizableTree.empty()
11401
11510
? ExternalUses.empty()
11402
11511
: true && "We shouldn't have any external users");
0 commit comments