@@ -278,6 +278,22 @@ static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
278
278
return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
279
279
}
280
280
281
+ /// Returns the number of elements of the given type \p Ty, not greater than \p
282
+ /// Sz, which forms type, which splits by \p TTI into whole vector types during
283
+ /// legalization.
284
+ static unsigned
285
+ getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
286
+ unsigned Sz) {
287
+ if (!isValidElementType(Ty))
288
+ return bit_floor(Sz);
289
+ // Find the number of elements, which forms full vectors.
290
+ unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
291
+ if (NumParts == 0 || NumParts >= Sz)
292
+ return bit_floor(Sz);
293
+ unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
294
+ return (Sz / RegVF) * RegVF;
295
+ }
296
+
281
297
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
282
298
SmallVectorImpl<int> &Mask) {
283
299
// The ShuffleBuilder implementation uses shufflevector to splat an "element".
@@ -7716,7 +7732,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
7716
7732
}
7717
7733
size_t NumUniqueScalarValues = UniqueValues.size();
7718
7734
bool IsFullVectors = hasFullVectorsOrPowerOf2(
7719
- *TTI, UniqueValues.front()->getType( ), NumUniqueScalarValues);
7735
+ *TTI, getValueType( UniqueValues.front()), NumUniqueScalarValues);
7720
7736
if (NumUniqueScalarValues == VL.size() &&
7721
7737
(VectorizeNonPowerOf2 || IsFullVectors)) {
7722
7738
ReuseShuffleIndices.clear();
@@ -17466,7 +17482,11 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
17466
17482
const unsigned Sz = R.getVectorElementSize(Chain[0]);
17467
17483
unsigned VF = Chain.size();
17468
17484
17469
- if (!has_single_bit(Sz) || !has_single_bit(VF) || VF < 2 || VF < MinVF) {
17485
+ if (!has_single_bit(Sz) ||
17486
+ !hasFullVectorsOrPowerOf2(
17487
+ *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
17488
+ VF) ||
17489
+ VF < 2 || VF < MinVF) {
17470
17490
// Check if vectorizing with a non-power-of-2 VF should be considered. At
17471
17491
// the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
17472
17492
// all vector lanes are used.
@@ -17484,10 +17504,12 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
17484
17504
InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
17485
17505
if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
17486
17506
DenseSet<Value *> Stores(Chain.begin(), Chain.end());
17487
- bool IsPowerOf2 =
17488
- has_single_bit(ValOps.size()) ||
17507
+ bool IsAllowedSize =
17508
+ hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
17509
+ ValOps.size()) ||
17489
17510
(VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
17490
- if ((!IsPowerOf2 && S.getOpcode() && S.getOpcode() != Instruction::Load &&
17511
+ if ((!IsAllowedSize && S.getOpcode() &&
17512
+ S.getOpcode() != Instruction::Load &&
17491
17513
(!S.MainOp->isSafeToRemove() ||
17492
17514
any_of(ValOps.getArrayRef(),
17493
17515
[&](Value *V) {
@@ -17498,7 +17520,7 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
17498
17520
}));
17499
17521
}))) ||
17500
17522
(ValOps.size() > Chain.size() / 2 && !S.getOpcode())) {
17501
- Size = (!IsPowerOf2 && S.getOpcode()) ? 1 : 2;
17523
+ Size = (!IsAllowedSize && S.getOpcode()) ? 1 : 2;
17502
17524
return false;
17503
17525
}
17504
17526
}
@@ -17626,15 +17648,11 @@ bool SLPVectorizerPass::vectorizeStores(
17626
17648
17627
17649
unsigned MaxVF =
17628
17650
std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
17629
- unsigned MaxRegVF = MaxVF;
17630
17651
auto *Store = cast<StoreInst>(Operands[0]);
17631
17652
Type *StoreTy = Store->getValueOperand()->getType();
17632
17653
Type *ValueTy = StoreTy;
17633
17654
if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
17634
17655
ValueTy = Trunc->getSrcTy();
17635
- if (ValueTy == StoreTy &&
17636
- R.getVectorElementSize(Store->getValueOperand()) <= EltSize)
17637
- MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
17638
17656
unsigned MinVF = std::max<unsigned>(
17639
17657
2, PowerOf2Ceil(TTI->getStoreMinimumVF(
17640
17658
R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
@@ -17652,10 +17670,21 @@ bool SLPVectorizerPass::vectorizeStores(
17652
17670
// First try vectorizing with a non-power-of-2 VF. At the moment, only
17653
17671
// consider cases where VF + 1 is a power-of-2, i.e. almost all vector
17654
17672
// lanes are used.
17655
- unsigned CandVF =
17656
- std::clamp<unsigned>(Operands.size(), MaxVF, MaxRegVF);
17657
- if (has_single_bit(CandVF + 1))
17673
+ unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
17674
+ if (has_single_bit(CandVF + 1)) {
17658
17675
NonPowerOf2VF = CandVF;
17676
+ assert(NonPowerOf2VF != MaxVF &&
17677
+ "Non-power-of-2 VF should not be equal to MaxVF");
17678
+ }
17679
+ }
17680
+
17681
+ unsigned MaxRegVF = MaxVF;
17682
+ MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
17683
+ if (MaxVF < MinVF) {
17684
+ LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
17685
+ << ") < "
17686
+ << "MinVF (" << MinVF << ")\n");
17687
+ continue;
17659
17688
}
17660
17689
17661
17690
unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
@@ -17810,7 +17839,7 @@ bool SLPVectorizerPass::vectorizeStores(
17810
17839
std::bind(IsNotVectorized, Size >= MaxRegVF,
17811
17840
std::placeholders::_1)));
17812
17841
}
17813
- if (!AnyProfitableGraph && Size >= MaxRegVF)
17842
+ if (!AnyProfitableGraph && Size >= MaxRegVF && has_single_bit(Size) )
17814
17843
break;
17815
17844
}
17816
17845
// All values vectorized - exit.
@@ -17823,16 +17852,21 @@ bool SLPVectorizerPass::vectorizeStores(
17823
17852
(Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
17824
17853
break;
17825
17854
constexpr unsigned StoresLimit = 64;
17826
- const unsigned MaxTotalNum = bit_floor( std::min<unsigned>(
17855
+ const unsigned MaxTotalNum = std::min<unsigned>(
17827
17856
Operands.size(),
17828
17857
static_cast<unsigned>(
17829
17858
End -
17830
17859
std::distance(
17831
17860
RangeSizes.begin(),
17832
17861
find_if(RangeSizes, std::bind(IsNotVectorized, true,
17833
17862
std::placeholders::_1))) +
17834
- 1)));
17835
- unsigned VF = PowerOf2Ceil(CandidateVFs.front()) * 2;
17863
+ 1));
17864
+ unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
17865
+ unsigned Limit =
17866
+ getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
17867
+ CandidateVFs.clear();
17868
+ if (bit_floor(Limit) == VF)
17869
+ CandidateVFs.push_back(Limit);
17836
17870
if (VF > MaxTotalNum || VF >= StoresLimit)
17837
17871
break;
17838
17872
for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
@@ -17841,7 +17875,6 @@ bool SLPVectorizerPass::vectorizeStores(
17841
17875
});
17842
17876
// Last attempt to vectorize max number of elements, if all previous
17843
17877
// attempts were unsuccessful because of the cost issues.
17844
- CandidateVFs.clear();
17845
17878
CandidateVFs.push_back(VF);
17846
17879
}
17847
17880
}
0 commit comments