@@ -1314,6 +1314,22 @@ static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
1314
1314
Sz % NumParts == 0;
1315
1315
}
1316
1316
1317
+ /// Returns the number of parts the type \p VecTy will be split into at the
1318
+ /// codegen phase, as reported by \p TTI. If the type is going to be scalarized,
1319
+ /// is split into \p Limit or more parts, or does not use whole registers, returns 1.
1320
+ static unsigned
1321
+ getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
1322
+ const unsigned Limit = std::numeric_limits<unsigned>::max()) {
1323
+ unsigned NumParts = TTI.getNumberOfParts(VecTy);
1324
+ if (NumParts == 0 || NumParts >= Limit) // TTI reported zero parts, or the count reaches the caller's limit.
1325
+ return 1;
1326
+ unsigned Sz = getNumElements(VecTy);
1327
+ if (NumParts >= Sz || Sz % NumParts != 0 || // Parts must divide the elements evenly, with more than one element per part.
1328
+ !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), Sz / NumParts))
1329
+ return 1;
1330
+ return NumParts;
1331
+ }
1332
+
1317
1333
namespace slpvectorizer {
1318
1334
1319
1335
/// Bottom Up SLP Vectorizer.
@@ -4618,12 +4634,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
4618
4634
if (!isValidElementType(ScalarTy))
4619
4635
return std::nullopt;
4620
4636
auto *VecTy = getWidenedType(ScalarTy, NumScalars);
4621
- int NumParts = TTI->getNumberOfParts(VecTy);
4622
- if (NumParts == 0 || NumParts >= NumScalars ||
4623
- VecTy->getNumElements() % NumParts != 0 ||
4624
- !hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(),
4625
- VecTy->getNumElements() / NumParts))
4626
- NumParts = 1;
4637
+ unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, NumScalars);
4627
4638
SmallVector<int> ExtractMask;
4628
4639
SmallVector<int> Mask;
4629
4640
SmallVector<SmallVector<const TreeEntry *>> Entries;
@@ -5574,8 +5585,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
5574
5585
}
5575
5586
}
5576
5587
if (Sz == 2 && TE.getVectorFactor() == 4 &&
5577
- TTI-> getNumberOfParts(getWidenedType(TE.Scalars.front()->getType(),
5578
- 2 * TE.getVectorFactor())) == 1)
5588
+ :: getNumberOfParts(*TTI, getWidenedType(TE.Scalars.front()->getType(),
5589
+ 2 * TE.getVectorFactor())) == 1)
5579
5590
return std::nullopt;
5580
5591
if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
5581
5592
Sz)) {
@@ -9846,13 +9857,13 @@ void BoUpSLP::transformNodes() {
9846
9857
// Do not try to vectorize small splats (less than vector register and
9847
9858
// only with the single non-undef element).
9848
9859
bool IsSplat = isSplat(Slice);
9849
- if (Slices.empty() || !IsSplat ||
9850
- ( VF <= 2 && 2 * std::clamp(TTI->getNumberOfParts(getWidenedType(
9851
- Slice.front()->getType(), VF)),
9852
- 1U, VF - 1) !=
9853
- std::clamp(TTI->getNumberOfParts(getWidenedType(
9854
- Slice.front()->getType(), 2 * VF)),
9855
- 1U, 2 * VF)) ||
9860
+ bool IsTwoRegisterSplat = true;
9861
+ if (IsSplat && VF == 2) {
9862
+ unsigned NumRegs2VF = ::getNumberOfParts(
9863
+ *TTI, getWidenedType(Slice.front()->getType(), 2 * VF));
9864
+ IsTwoRegisterSplat = NumRegs2VF == 2;
9865
+ }
9866
+ if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
9856
9867
count(Slice, Slice.front()) ==
9857
9868
static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
9858
9869
: 1)) {
@@ -10793,12 +10804,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
10793
10804
}
10794
10805
assert(!CommonMask.empty() && "Expected non-empty common mask.");
10795
10806
auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
10796
- unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
10797
- if (NumParts == 0 || NumParts >= Mask.size() ||
10798
- MaskVecTy->getNumElements() % NumParts != 0 ||
10799
- !hasFullVectorsOrPowerOf2(TTI, MaskVecTy->getElementType(),
10800
- MaskVecTy->getNumElements() / NumParts))
10801
- NumParts = 1;
10807
+ unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
10802
10808
unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
10803
10809
const auto *It =
10804
10810
find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
@@ -10813,12 +10819,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
10813
10819
}
10814
10820
assert(!CommonMask.empty() && "Expected non-empty common mask.");
10815
10821
auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
10816
- unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
10817
- if (NumParts == 0 || NumParts >= Mask.size() ||
10818
- MaskVecTy->getNumElements() % NumParts != 0 ||
10819
- !hasFullVectorsOrPowerOf2(TTI, MaskVecTy->getElementType(),
10820
- MaskVecTy->getNumElements() / NumParts))
10821
- NumParts = 1;
10822
+ unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
10822
10823
unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
10823
10824
const auto *It =
10824
10825
find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
@@ -11351,7 +11352,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
11351
11352
unsigned const NumElts = SrcVecTy->getNumElements();
11352
11353
unsigned const NumScalars = VL.size();
11353
11354
11354
- unsigned NumOfParts = TTI-> getNumberOfParts(SrcVecTy);
11355
+ unsigned NumOfParts = :: getNumberOfParts(*TTI, SrcVecTy);
11355
11356
11356
11357
SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
11357
11358
unsigned OffsetBeg = *getElementIndex(VL.front());
@@ -14862,12 +14863,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
14862
14863
SmallVector<SmallVector<const TreeEntry *>> Entries;
14863
14864
Type *OrigScalarTy = GatheredScalars.front()->getType();
14864
14865
auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
14865
- unsigned NumParts = TTI->getNumberOfParts(VecTy);
14866
- if (NumParts == 0 || NumParts >= GatheredScalars.size() ||
14867
- VecTy->getNumElements() % NumParts != 0 ||
14868
- !hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(),
14869
- VecTy->getNumElements() / NumParts))
14870
- NumParts = 1;
14866
+ unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, GatheredScalars.size());
14871
14867
if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
14872
14868
// Check for gathered extracts.
14873
14869
bool Resized = false;
@@ -14899,12 +14895,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
14899
14895
Resized = true;
14900
14896
GatheredScalars.append(VF - GatheredScalars.size(),
14901
14897
PoisonValue::get(OrigScalarTy));
14902
- NumParts = TTI->getNumberOfParts(getWidenedType(OrigScalarTy, VF));
14903
- if (NumParts == 0 || NumParts >= GatheredScalars.size() ||
14904
- VecTy->getNumElements() % NumParts != 0 ||
14905
- !hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(),
14906
- VecTy->getNumElements() / NumParts))
14907
- NumParts = 1;
14898
+ NumParts =
14899
+ ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF), VF);
14908
14900
}
14909
14901
}
14910
14902
}
@@ -17049,10 +17041,10 @@ void BoUpSLP::optimizeGatherSequence() {
17049
17041
// Check if the last undefs actually change the final number of used vector
17050
17042
// registers.
17051
17043
return SM1.size() - LastUndefsCnt > 1 &&
17052
- TTI-> getNumberOfParts(SI1->getType()) ==
17053
- TTI-> getNumberOfParts(
17054
- getWidenedType(SI1->getType()->getElementType(),
17055
- SM1.size() - LastUndefsCnt));
17044
+ :: getNumberOfParts(*TTI, SI1->getType()) ==
17045
+ :: getNumberOfParts(
17046
+ *TTI, getWidenedType(SI1->getType()->getElementType(),
17047
+ SM1.size() - LastUndefsCnt));
17056
17048
};
17057
17049
// Perform O(N^2) search over the gather/shuffle sequences and merge identical
17058
17050
// instructions. TODO: We can further optimize this scan if we split the
@@ -17829,9 +17821,12 @@ bool BoUpSLP::collectValuesToDemote(
17829
17821
const unsigned VF = E.Scalars.size();
17830
17822
Type *OrigScalarTy = E.Scalars.front()->getType();
17831
17823
if (UniqueBases.size() <= 2 ||
17832
- TTI->getNumberOfParts(getWidenedType(OrigScalarTy, VF)) ==
17833
- TTI->getNumberOfParts(getWidenedType(
17834
- IntegerType::get(OrigScalarTy->getContext(), BitWidth), VF)))
17824
+ ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) ==
17825
+ ::getNumberOfParts(
17826
+ *TTI,
17827
+ getWidenedType(
17828
+ IntegerType::get(OrigScalarTy->getContext(), BitWidth),
17829
+ VF)))
17835
17830
ToDemote.push_back(E.Idx);
17836
17831
}
17837
17832
return Res;
@@ -18241,8 +18236,8 @@ void BoUpSLP::computeMinimumValueSizes() {
18241
18236
[&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
18242
18237
return 0u;
18243
18238
18244
- unsigned NumParts = TTI-> getNumberOfParts(
18245
- getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
18239
+ unsigned NumParts = :: getNumberOfParts(
18240
+ *TTI, getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
18246
18241
18247
18242
// The maximum bit width required to represent all the values that can be
18248
18243
// demoted without loss of precision. It would be safe to truncate the roots
@@ -18302,8 +18297,10 @@ void BoUpSLP::computeMinimumValueSizes() {
18302
18297
// use - ignore it.
18303
18298
if (NumParts > 1 &&
18304
18299
NumParts ==
18305
- TTI->getNumberOfParts(getWidenedType(
18306
- IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
18300
+ ::getNumberOfParts(
18301
+ *TTI, getWidenedType(IntegerType::get(F->getContext(),
18302
+ bit_ceil(MaxBitWidth)),
18303
+ VF)))
18307
18304
return 0u;
18308
18305
18309
18306
unsigned Opcode = E.getOpcode();
@@ -20086,14 +20083,14 @@ class HorizontalReduction {
20086
20083
ReduxWidth =
20087
20084
getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
20088
20085
VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
20089
- NumParts = TTI. getNumberOfParts(Tp);
20086
+ NumParts = :: getNumberOfParts(TTI, Tp);
20090
20087
NumRegs =
20091
20088
TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
20092
20089
while (NumParts > NumRegs) {
20093
20090
assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
20094
20091
ReduxWidth = bit_floor(ReduxWidth - 1);
20095
20092
VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
20096
- NumParts = TTI. getNumberOfParts(Tp);
20093
+ NumParts = :: getNumberOfParts(TTI, Tp);
20097
20094
NumRegs =
20098
20095
TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
20099
20096
}
0 commit comments