@@ -260,6 +260,20 @@ static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
260
260
VF * getNumElements(ScalarTy));
261
261
}
262
262
263
+ /// Returns the number of elements of the given type \p Ty, not less than \p Sz,
264
+ /// which forms type, which splits by \p TTI into whole vector types during
265
+ /// legalization.
266
+ static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
267
+ Type *Ty, unsigned Sz) {
268
+ if (!isValidElementType(Ty))
269
+ return bit_ceil(Sz);
270
+ // Find the number of elements, which forms full vectors.
271
+ const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
272
+ if (NumParts == 0 || NumParts >= Sz)
273
+ return bit_ceil(Sz);
274
+ return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
275
+ }
276
+
263
277
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
264
278
SmallVectorImpl<int> &Mask) {
265
279
// The ShuffleBuilder implementation uses shufflevector to splat an "element".
@@ -394,7 +408,7 @@ static bool isVectorLikeInstWithConstOps(Value *V) {
394
408
/// total number of elements \p Size and number of registers (parts) \p
395
409
/// NumParts.
396
410
static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
397
- return PowerOf2Ceil( divideCeil(Size, NumParts));
411
+ return std::min<unsigned>(Size, bit_ceil( divideCeil(Size, NumParts) ));
398
412
}
399
413
400
414
/// Returns correct remaining number of elements, considering total amount \p
@@ -1222,6 +1236,22 @@ static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1222
1236
(all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
1223
1237
}
1224
1238
1239
+ /// Returns true if widened type of \p Ty elements with size \p Sz represents
1240
+ /// full vector type, i.e. adding extra element results in extra parts upon type
1241
+ /// legalization.
1242
+ static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
1243
+ unsigned Sz) {
1244
+ if (Sz <= 1)
1245
+ return false;
1246
+ if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty))
1247
+ return false;
1248
+ if (has_single_bit(Sz))
1249
+ return true;
1250
+ const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
1251
+ return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
1252
+ Sz % NumParts == 0;
1253
+ }
1254
+
1225
1255
namespace slpvectorizer {
1226
1256
1227
1257
/// Bottom Up SLP Vectorizer.
@@ -3311,6 +3341,15 @@ class BoUpSLP {
3311
3341
/// Return true if this is a non-power-of-2 node.
3312
3342
bool isNonPowOf2Vec() const {
3313
3343
bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
3344
+ return IsNonPowerOf2;
3345
+ }
3346
+
3347
+ /// Return true if the scalars of this node neither form whole vector
3348
+ /// registers after legalization nor have a power-of-2 count.
3349
+ bool
3350
+ hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
3351
+ bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
3352
+ TTI, getValueType(Scalars.front()), Scalars.size());
3314
3353
assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
3315
3354
"Reshuffling not supported with non-power-of-2 vectors yet.");
3316
3355
return IsNonPowerOf2;
@@ -3430,8 +3469,10 @@ class BoUpSLP {
3430
3469
Last->State = EntryState;
3431
3470
// FIXME: Remove once support for ReuseShuffleIndices has been implemented
3432
3471
// for non-power-of-two vectors.
3433
- assert((has_single_bit(VL.size()) || ReuseShuffleIndices.empty()) &&
3434
- "Reshuffling scalars not yet supported for nodes with padding");
3472
+ assert(
3473
+ (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
3474
+ ReuseShuffleIndices.empty()) &&
3475
+ "Reshuffling scalars not yet supported for nodes with padding");
3435
3476
Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3436
3477
ReuseShuffleIndices.end());
3437
3478
if (ReorderIndices.empty()) {
@@ -5269,7 +5310,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
5269
5310
// node.
5270
5311
if (!TE.ReuseShuffleIndices.empty()) {
5271
5312
// FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
5272
- assert(!TE.isNonPowOf2Vec( ) &&
5313
+ assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI ) &&
5273
5314
"Reshuffling scalars not yet supported for nodes with padding");
5274
5315
5275
5316
if (isSplat(TE.Scalars))
@@ -5509,7 +5550,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
5509
5550
}
5510
5551
// FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
5511
5552
// has been audited for correctness with non-power-of-two vectors.
5512
- if (!TE.isNonPowOf2Vec( ))
5553
+ if (!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI ))
5513
5554
if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
5514
5555
return CurrentOrder;
5515
5556
}
@@ -5662,15 +5703,18 @@ void BoUpSLP::reorderTopToBottom() {
5662
5703
});
5663
5704
5664
5705
// Reorder the graph nodes according to their vectorization factor.
5665
- for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
5666
- VF = bit_ceil (VF) / 2 ) {
5706
+ for (unsigned VF = VectorizableTree.front()->getVectorFactor();
5707
+ !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U) ) {
5667
5708
auto It = VFToOrderedEntries.find(VF);
5668
5709
if (It == VFToOrderedEntries.end())
5669
5710
continue;
5670
5711
// Try to find the most profitable order. We just are looking for the most
5671
5712
// used order and reorder scalar elements in the nodes according to this
5672
5713
// mostly used order.
5673
5714
ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
5715
+ // Delete VF entry upon exit.
5716
+ auto Cleanup = make_scope_exit([&]() { VFToOrderedEntries.erase(It); });
5717
+
5674
5718
// All operands are reordered and used only in this node - propagate the
5675
5719
// most used order to the user node.
5676
5720
MapVector<OrdersType, unsigned,
@@ -7529,33 +7573,36 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
7529
7573
UniqueValues.emplace_back(V);
7530
7574
}
7531
7575
size_t NumUniqueScalarValues = UniqueValues.size();
7532
- if (NumUniqueScalarValues == VL.size()) {
7576
+ bool IsFullVectors = hasFullVectorsOrPowerOf2(
7577
+ *TTI, UniqueValues.front()->getType(), NumUniqueScalarValues);
7578
+ if (NumUniqueScalarValues == VL.size() &&
7579
+ (VectorizeNonPowerOf2 || IsFullVectors)) {
7533
7580
ReuseShuffleIndices.clear();
7534
7581
} else {
7535
7582
// FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
7536
- if ((UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) ||
7537
- !llvm::has_single_bit(VL.size())) {
7583
+ if ((UserTreeIdx.UserTE &&
7584
+ UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) ||
7585
+ !has_single_bit(VL.size())) {
7538
7586
LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
7539
7587
"for nodes with padding.\n");
7540
7588
newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
7541
7589
return false;
7542
7590
}
7543
7591
LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
7544
- if (NumUniqueScalarValues <= 1 ||
7545
- (UniquePositions.size() == 1 && all_of(UniqueValues,
7546
- [](Value *V) {
7547
- return isa<UndefValue>(V) ||
7548
- !isConstant(V);
7549
- })) ||
7550
- !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
7592
+ if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
7593
+ (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
7594
+ return isa<UndefValue>(V) || !isConstant(V);
7595
+ }))) {
7551
7596
if (DoNotFail && UniquePositions.size() > 1 &&
7552
7597
NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
7553
7598
all_of(UniqueValues, [=](Value *V) {
7554
7599
return isa<ExtractElementInst>(V) ||
7555
7600
areAllUsersVectorized(cast<Instruction>(V),
7556
7601
UserIgnoreList);
7557
7602
})) {
7558
- unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
7603
+ // Find the number of elements, which forms full vectors.
7604
+ unsigned PWSz = getFullVectorNumberOfElements(
7605
+ *TTI, UniqueValues.front()->getType(), UniqueValues.size());
7559
7606
if (PWSz == VL.size()) {
7560
7607
ReuseShuffleIndices.clear();
7561
7608
} else {
@@ -9793,9 +9840,6 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
9793
9840
return nullptr;
9794
9841
Value *VecBase = nullptr;
9795
9842
ArrayRef<Value *> VL = E->Scalars;
9796
- // If the resulting type is scalarized, do not adjust the cost.
9797
- if (NumParts == VL.size())
9798
- return nullptr;
9799
9843
// Check if it can be considered reused if same extractelements were
9800
9844
// vectorized already.
9801
9845
bool PrevNodeFound = any_of(
@@ -10450,7 +10494,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
10450
10494
InsertMask[Idx] = I + 1;
10451
10495
}
10452
10496
unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
10453
- if (NumOfParts > 0)
10497
+ if (NumOfParts > 0 && NumOfParts < NumElts )
10454
10498
VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
10455
10499
unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
10456
10500
VecScalarsSz;
@@ -17785,7 +17829,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
17785
17829
for (unsigned I = NextInst; I < MaxInst; ++I) {
17786
17830
unsigned ActualVF = std::min(MaxInst - I, VF);
17787
17831
17788
- if (!has_single_bit( ActualVF))
17832
+ if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
17789
17833
continue;
17790
17834
17791
17835
if (MaxVFOnly && ActualVF < MaxVF)
0 commit comments