@@ -2656,7 +2656,9 @@ class BoUpSLP {
2656
2656
}
2657
2657
// TODO: Check if we can remove a check for non-power-2 number of
2658
2658
// scalars after full support of non-power-2 vectorization.
2659
- return UniqueValues.size() != 2 && has_single_bit(UniqueValues.size());
2659
+ return UniqueValues.size() != 2 &&
2660
+ hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
2661
+ UniqueValues.size());
2660
2662
};
2661
2663
2662
2664
// If the initial strategy fails for any of the operand indexes, then we
@@ -5101,12 +5103,13 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
5101
5103
});
5102
5104
});
5103
5105
const unsigned AbsoluteDiff = std::abs(*Diff);
5104
- if (IsPossibleStrided && (IsAnyPointerUsedOutGraph ||
5105
- ((Sz > MinProfitableStridedLoads ||
5106
- (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
5107
- has_single_bit(AbsoluteDiff))) &&
5108
- AbsoluteDiff > Sz) ||
5109
- *Diff == -(static_cast<int>(Sz) - 1))) {
5106
+ if (IsPossibleStrided &&
5107
+ (IsAnyPointerUsedOutGraph ||
5108
+ (AbsoluteDiff > Sz &&
5109
+ (Sz > MinProfitableStridedLoads ||
5110
+ (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
5111
+ AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
5112
+ *Diff == -(static_cast<int>(Sz) - 1))) {
5110
5113
int Stride = *Diff / static_cast<int>(Sz - 1);
5111
5114
if (*Diff == Stride * static_cast<int>(Sz - 1)) {
5112
5115
Align Alignment =
@@ -5192,17 +5195,20 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
5192
5195
return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
5193
5196
5194
5197
// FIXME: The following code has not been updated for non-power-of-2
5195
- // vectors. The splitting logic here does not cover the original
5196
- // vector if the vector factor is not a power of two. FIXME
5197
- if (!has_single_bit( VL.size()))
5198
+ // vectors (and not whole registers). The splitting logic here does not
5199
+ // cover the original vector if the vector factor is not a power of two.
5200
+ if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size()))
5198
5201
return false;
5199
5202
5200
5203
unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
5201
5204
unsigned MinVF = getMinVF(2 * Sz);
5202
5205
DemandedElts.clearAllBits();
5203
5206
// Iterate through possible vectorization factors and check if vectorized +
5204
5207
// shuffles is better than just gather.
5205
- for (unsigned VF = VL.size() / 2; VF >= MinVF; VF /= 2) {
5208
+ for (unsigned VF =
5209
+ getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1);
5210
+ VF >= MinVF;
5211
+ VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
5206
5212
SmallVector<LoadsState> States;
5207
5213
for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
5208
5214
ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
@@ -7632,8 +7638,9 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
7632
7638
case Instruction::ExtractValue:
7633
7639
case Instruction::ExtractElement: {
7634
7640
bool Reuse = canReuseExtract(VL, CurrentOrder);
7635
- // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
7636
- if (!has_single_bit(VL.size()))
7641
+ // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
7642
+ // non-full registers).
7643
+ if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
7637
7644
return TreeEntry::NeedToGather;
7638
7645
if (Reuse || !CurrentOrder.empty())
7639
7646
return TreeEntry::Vectorize;
@@ -8089,7 +8096,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
8089
8096
// FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
8090
8097
if ((UserTreeIdx.UserTE &&
8091
8098
UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) ||
8092
- !has_single_bit( VL.size())) {
8099
+ !hasFullVectorsOrPowerOf2(*TTI, VL.front()->getType(), VL.size())) {
8093
8100
LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
8094
8101
"for nodes with padding.\n");
8095
8102
newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
@@ -9840,7 +9847,8 @@ void BoUpSLP::transformNodes() {
9840
9847
if (!S || S.isAltShuffle() || !allSameBlock(Slice) ||
9841
9848
(S.getOpcode() == Instruction::Load &&
9842
9849
areKnownNonVectorizableLoads(Slice)) ||
9843
- (S.getOpcode() != Instruction::Load && !has_single_bit(VF)))
9850
+ (S.getOpcode() != Instruction::Load &&
9851
+ !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF)))
9844
9852
continue;
9845
9853
if (VF == 2) {
9846
9854
// Try to vectorize reduced values or if all users are vectorized.
@@ -13618,8 +13626,9 @@ BoUpSLP::isGatherShuffledEntry(
13618
13626
return !TE->isGather();
13619
13627
})))
13620
13628
return {};
13621
- // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
13622
- if (TE->isNonPowOf2Vec())
13629
+ // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not
13630
+ // implemented yet.
13631
+ if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
13623
13632
return {};
13624
13633
Mask.assign(VL.size(), PoisonMaskElem);
13625
13634
assert((TE->UserTreeIndices.size() == 1 ||
@@ -19200,9 +19209,11 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
19200
19209
}
19201
19210
}
19202
19211
19212
+ Type *ScalarTy = getValueType(VL[0]);
19203
19213
unsigned Sz = R.getVectorElementSize(I0);
19204
19214
unsigned MinVF = R.getMinVF(Sz);
19205
- unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
19215
+ unsigned MaxVF = std::max<unsigned>(
19216
+ getFloorFullVectorNumberOfElements(*TTI, ScalarTy, VL.size()), MinVF);
19206
19217
MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
19207
19218
if (MaxVF < 2) {
19208
19219
R.getORE()->emit([&]() {
@@ -19216,10 +19227,10 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
19216
19227
bool Changed = false;
19217
19228
bool CandidateFound = false;
19218
19229
InstructionCost MinCost = SLPCostThreshold.getValue();
19219
- Type *ScalarTy = getValueType(VL[0]);
19220
19230
19221
19231
unsigned NextInst = 0, MaxInst = VL.size();
19222
- for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
19232
+ for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
19233
+ VF = getFloorFullVectorNumberOfElements(*TTI, I0->getType(), VF - 1)) {
19223
19234
// No actual vectorization should happen, if number of parts is the same as
19224
19235
// provided vectorization factor (i.e. the scalar type is used for vector
19225
19236
// code during codegen).
@@ -19234,7 +19245,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
19234
19245
19235
19246
if (MaxVFOnly && ActualVF < MaxVF)
19236
19247
break;
19237
- if ((VF > MinVF && ActualVF <= VF / 2 ) || (VF == MinVF && ActualVF < 2))
19248
+ if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
19238
19249
break;
19239
19250
19240
19251
SmallVector<Value *> Ops(ActualVF, nullptr);
0 commit comments