@@ -260,6 +260,20 @@ static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
260
260
VF * getNumElements(ScalarTy));
261
261
}
262
262
263
+ /// Returns the number of elements of the given type \p Ty, not less than \p Sz,
264
+ /// which forms type, which splits by \p TTI into whole vector types during
265
+ /// legalization.
266
+ static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
267
+ Type *Ty, unsigned Sz) {
268
+ if (!isValidElementType(Ty))
269
+ return bit_ceil(Sz);
270
+ // Find the number of elements, which forms full vectors.
271
+ const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
272
+ if (NumParts == 0 || NumParts >= Sz)
273
+ return bit_ceil(Sz);
274
+ return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
275
+ }
276
+
263
277
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
264
278
SmallVectorImpl<int> &Mask) {
265
279
// The ShuffleBuilder implementation uses shufflevector to splat an "element".
@@ -394,7 +408,7 @@ static bool isVectorLikeInstWithConstOps(Value *V) {
394
408
/// total number of elements \p Size and number of registers (parts) \p
395
409
/// NumParts.
396
410
static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
397
- return PowerOf2Ceil( divideCeil(Size, NumParts));
411
+ return std::min<unsigned>(Size, bit_ceil( divideCeil(Size, NumParts) ));
398
412
}
399
413
400
414
/// Returns correct remaining number of elements, considering total amount \p
@@ -1222,6 +1236,22 @@ static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1222
1236
(all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
1223
1237
}
1224
1238
1239
+ /// Returns true if widened type of \p Ty elements with size \p Sz represents
1240
+ /// full vector type, i.e. adding extra element results in extra parts upon type
1241
+ /// legalization.
1242
+ static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
1243
+ unsigned Sz) {
1244
+ if (Sz <= 1)
1245
+ return false;
1246
+ if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty))
1247
+ return false;
1248
+ if (has_single_bit(Sz))
1249
+ return true;
1250
+ const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
1251
+ return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
1252
+ Sz % NumParts == 0;
1253
+ }
1254
+
1225
1255
namespace slpvectorizer {
1226
1256
1227
1257
/// Bottom Up SLP Vectorizer.
@@ -3311,6 +3341,15 @@ class BoUpSLP {
3311
3341
/// Return true if this is a non-power-of-2 node.
3312
3342
bool isNonPowOf2Vec() const {
3313
3343
bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
3344
+ return IsNonPowerOf2;
3345
+ }
3346
+
3347
+ /// Return true if the number of scalars in this node neither forms whole
3348
+ /// vector registers nor is a power of 2.
3349
+ bool
3350
+ hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
3351
+ bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
3352
+ TTI, getValueType(Scalars.front()), Scalars.size());
3314
3353
assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
3315
3354
"Reshuffling not supported with non-power-of-2 vectors yet.");
3316
3355
return IsNonPowerOf2;
@@ -3430,8 +3469,10 @@ class BoUpSLP {
3430
3469
Last->State = EntryState;
3431
3470
// FIXME: Remove once support for ReuseShuffleIndices has been implemented
3432
3471
// for non-power-of-two vectors.
3433
- assert((has_single_bit(VL.size()) || ReuseShuffleIndices.empty()) &&
3434
- "Reshuffling scalars not yet supported for nodes with padding");
3472
+ assert(
3473
+ (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
3474
+ ReuseShuffleIndices.empty()) &&
3475
+ "Reshuffling scalars not yet supported for nodes with padding");
3435
3476
Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3436
3477
ReuseShuffleIndices.end());
3437
3478
if (ReorderIndices.empty()) {
@@ -4412,7 +4453,8 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
4412
4453
return std::nullopt;
4413
4454
auto *VecTy = getWidenedType(ScalarTy, NumScalars);
4414
4455
int NumParts = TTI->getNumberOfParts(VecTy);
4415
- if (NumParts == 0 || NumParts >= NumScalars)
4456
+ if (NumParts == 0 || NumParts >= NumScalars ||
4457
+ VecTy->getNumElements() % NumParts != 0)
4416
4458
NumParts = 1;
4417
4459
SmallVector<int> ExtractMask;
4418
4460
SmallVector<int> Mask;
@@ -5269,7 +5311,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
5269
5311
// node.
5270
5312
if (!TE.ReuseShuffleIndices.empty()) {
5271
5313
// FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
5272
- assert(!TE.isNonPowOf2Vec( ) &&
5314
+ assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI ) &&
5273
5315
"Reshuffling scalars not yet supported for nodes with padding");
5274
5316
5275
5317
if (isSplat(TE.Scalars))
@@ -5509,7 +5551,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
5509
5551
}
5510
5552
// FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
5511
5553
// has been audited for correctness with non-power-of-two vectors.
5512
- if (!TE.isNonPowOf2Vec( ))
5554
+ if (!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI ))
5513
5555
if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
5514
5556
return CurrentOrder;
5515
5557
}
@@ -5662,15 +5704,18 @@ void BoUpSLP::reorderTopToBottom() {
5662
5704
});
5663
5705
5664
5706
// Reorder the graph nodes according to their vectorization factor.
5665
- for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
5666
- VF = bit_ceil (VF) / 2 ) {
5707
+ for (unsigned VF = VectorizableTree.front()->getVectorFactor();
5708
+ !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U) ) {
5667
5709
auto It = VFToOrderedEntries.find(VF);
5668
5710
if (It == VFToOrderedEntries.end())
5669
5711
continue;
5670
5712
// Try to find the most profitable order. We just are looking for the most
5671
5713
// used order and reorder scalar elements in the nodes according to this
5672
5714
// mostly used order.
5673
5715
ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
5716
+ // Delete VF entry upon exit.
5717
+ auto Cleanup = make_scope_exit([&]() { VFToOrderedEntries.erase(It); });
5718
+
5674
5719
// All operands are reordered and used only in this node - propagate the
5675
5720
// most used order to the user node.
5676
5721
MapVector<OrdersType, unsigned,
@@ -6413,7 +6458,8 @@ static void gatherPossiblyVectorizableLoads(
6413
6458
if (NumScalars > 1) {
6414
6459
auto *VecTy = getWidenedType(ScalarTy, NumScalars);
6415
6460
NumParts = TTI.getNumberOfParts(VecTy);
6416
- if (NumParts == 0 || NumParts >= NumScalars)
6461
+ if (NumParts == 0 || NumParts >= NumScalars ||
6462
+ VecTy->getNumElements() % NumParts != 0)
6417
6463
NumParts = 1;
6418
6464
}
6419
6465
unsigned VF = PowerOf2Ceil(NumScalars / NumParts);
@@ -7529,33 +7575,36 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
7529
7575
UniqueValues.emplace_back(V);
7530
7576
}
7531
7577
size_t NumUniqueScalarValues = UniqueValues.size();
7532
- if (NumUniqueScalarValues == VL.size()) {
7578
+ bool IsFullVectors = hasFullVectorsOrPowerOf2(
7579
+ *TTI, UniqueValues.front()->getType(), NumUniqueScalarValues);
7580
+ if (NumUniqueScalarValues == VL.size() &&
7581
+ (VectorizeNonPowerOf2 || IsFullVectors)) {
7533
7582
ReuseShuffleIndices.clear();
7534
7583
} else {
7535
7584
// FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
7536
- if ((UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) ||
7537
- !llvm::has_single_bit(VL.size())) {
7585
+ if ((UserTreeIdx.UserTE &&
7586
+ UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) ||
7587
+ !has_single_bit(VL.size())) {
7538
7588
LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
7539
7589
"for nodes with padding.\n");
7540
7590
newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
7541
7591
return false;
7542
7592
}
7543
7593
LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
7544
- if (NumUniqueScalarValues <= 1 ||
7545
- (UniquePositions.size() == 1 && all_of(UniqueValues,
7546
- [](Value *V) {
7547
- return isa<UndefValue>(V) ||
7548
- !isConstant(V);
7549
- })) ||
7550
- !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
7594
+ if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
7595
+ (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
7596
+ return isa<UndefValue>(V) || !isConstant(V);
7597
+ }))) {
7551
7598
if (DoNotFail && UniquePositions.size() > 1 &&
7552
7599
NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
7553
7600
all_of(UniqueValues, [=](Value *V) {
7554
7601
return isa<ExtractElementInst>(V) ||
7555
7602
areAllUsersVectorized(cast<Instruction>(V),
7556
7603
UserIgnoreList);
7557
7604
})) {
7558
- unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
7605
+ // Find the number of elements, which forms full vectors.
7606
+ unsigned PWSz = getFullVectorNumberOfElements(
7607
+ *TTI, UniqueValues.front()->getType(), UniqueValues.size());
7559
7608
if (PWSz == VL.size()) {
7560
7609
ReuseShuffleIndices.clear();
7561
7610
} else {
@@ -9793,9 +9842,6 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
9793
9842
return nullptr;
9794
9843
Value *VecBase = nullptr;
9795
9844
ArrayRef<Value *> VL = E->Scalars;
9796
- // If the resulting type is scalarized, do not adjust the cost.
9797
- if (NumParts == VL.size())
9798
- return nullptr;
9799
9845
// Check if it can be considered reused if same extractelements were
9800
9846
// vectorized already.
9801
9847
bool PrevNodeFound = any_of(
@@ -9911,7 +9957,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
9911
9957
assert(!CommonMask.empty() && "Expected non-empty common mask.");
9912
9958
auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
9913
9959
unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
9914
- if (NumParts == 0 || NumParts >= Mask.size())
9960
+ if (NumParts == 0 || NumParts >= Mask.size() ||
9961
+ MaskVecTy->getNumElements() % NumParts != 0)
9915
9962
NumParts = 1;
9916
9963
unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
9917
9964
const auto *It =
@@ -9928,7 +9975,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
9928
9975
assert(!CommonMask.empty() && "Expected non-empty common mask.");
9929
9976
auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
9930
9977
unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
9931
- if (NumParts == 0 || NumParts >= Mask.size())
9978
+ if (NumParts == 0 || NumParts >= Mask.size() ||
9979
+ MaskVecTy->getNumElements() % NumParts != 0)
9932
9980
NumParts = 1;
9933
9981
unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
9934
9982
const auto *It =
@@ -10450,7 +10498,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
10450
10498
InsertMask[Idx] = I + 1;
10451
10499
}
10452
10500
unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
10453
- if (NumOfParts > 0)
10501
+ if (NumOfParts > 0 && NumOfParts < NumElts )
10454
10502
VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
10455
10503
unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
10456
10504
VecScalarsSz;
@@ -13579,7 +13627,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
13579
13627
Type *OrigScalarTy = GatheredScalars.front()->getType();
13580
13628
auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
13581
13629
unsigned NumParts = TTI->getNumberOfParts(VecTy);
13582
- if (NumParts == 0 || NumParts >= GatheredScalars.size())
13630
+ if (NumParts == 0 || NumParts >= GatheredScalars.size() ||
13631
+ VecTy->getNumElements() % NumParts != 0)
13583
13632
NumParts = 1;
13584
13633
if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
13585
13634
// Check for gathered extracts.
@@ -17785,7 +17834,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
17785
17834
for (unsigned I = NextInst; I < MaxInst; ++I) {
17786
17835
unsigned ActualVF = std::min(MaxInst - I, VF);
17787
17836
17788
- if (!has_single_bit( ActualVF))
17837
+ if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
17789
17838
continue;
17790
17839
17791
17840
if (MaxVFOnly && ActualVF < MaxVF)
0 commit comments