Skip to content

Commit 947d8eb

Browse files
[SLP]Unify getNumberOfParts use
Adds getNumberOfParts and uses it instead of similar code across code base, fixes analysis of non-vectorizable types in computeMinimumValueSizes. Reviewers: RKSimon Reviewed By: RKSimon Pull Request: #124774
1 parent 1bc5fe6 commit 947d8eb

File tree

2 files changed

+60
-55
lines changed

2 files changed

+60
-55
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 50 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -1314,6 +1314,22 @@ static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
13141314
Sz % NumParts == 0;
13151315
}
13161316

1317+
/// Returns number of parts, the type \p VecTy will be split at the codegen
1318+
/// phase. If the type is going to be scalarized or does not use whole
1319+
/// registers, returns 1.
1320+
static unsigned
1321+
getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
1322+
const unsigned Limit = std::numeric_limits<unsigned>::max()) {
1323+
unsigned NumParts = TTI.getNumberOfParts(VecTy);
1324+
if (NumParts == 0 || NumParts >= Limit)
1325+
return 1;
1326+
unsigned Sz = getNumElements(VecTy);
1327+
if (NumParts >= Sz || Sz % NumParts != 0 ||
1328+
!hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), Sz / NumParts))
1329+
return 1;
1330+
return NumParts;
1331+
}
1332+
13171333
namespace slpvectorizer {
13181334

13191335
/// Bottom Up SLP Vectorizer.
@@ -4618,12 +4634,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
46184634
if (!isValidElementType(ScalarTy))
46194635
return std::nullopt;
46204636
auto *VecTy = getWidenedType(ScalarTy, NumScalars);
4621-
int NumParts = TTI->getNumberOfParts(VecTy);
4622-
if (NumParts == 0 || NumParts >= NumScalars ||
4623-
VecTy->getNumElements() % NumParts != 0 ||
4624-
!hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(),
4625-
VecTy->getNumElements() / NumParts))
4626-
NumParts = 1;
4637+
unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, NumScalars);
46274638
SmallVector<int> ExtractMask;
46284639
SmallVector<int> Mask;
46294640
SmallVector<SmallVector<const TreeEntry *>> Entries;
@@ -5574,8 +5585,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
55745585
}
55755586
}
55765587
if (Sz == 2 && TE.getVectorFactor() == 4 &&
5577-
TTI->getNumberOfParts(getWidenedType(TE.Scalars.front()->getType(),
5578-
2 * TE.getVectorFactor())) == 1)
5588+
::getNumberOfParts(*TTI, getWidenedType(TE.Scalars.front()->getType(),
5589+
2 * TE.getVectorFactor())) == 1)
55795590
return std::nullopt;
55805591
if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
55815592
Sz)) {
@@ -9846,13 +9857,13 @@ void BoUpSLP::transformNodes() {
98469857
// Do not try to vectorize small splats (less than vector register and
98479858
// only with the single non-undef element).
98489859
bool IsSplat = isSplat(Slice);
9849-
if (Slices.empty() || !IsSplat ||
9850-
(VF <= 2 && 2 * std::clamp(TTI->getNumberOfParts(getWidenedType(
9851-
Slice.front()->getType(), VF)),
9852-
1U, VF - 1) !=
9853-
std::clamp(TTI->getNumberOfParts(getWidenedType(
9854-
Slice.front()->getType(), 2 * VF)),
9855-
1U, 2 * VF)) ||
9860+
bool IsTwoRegisterSplat = true;
9861+
if (IsSplat && VF == 2) {
9862+
unsigned NumRegs2VF = ::getNumberOfParts(
9863+
*TTI, getWidenedType(Slice.front()->getType(), 2 * VF));
9864+
IsTwoRegisterSplat = NumRegs2VF == 2;
9865+
}
9866+
if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
98569867
count(Slice, Slice.front()) ==
98579868
static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
98589869
: 1)) {
@@ -10793,12 +10804,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
1079310804
}
1079410805
assert(!CommonMask.empty() && "Expected non-empty common mask.");
1079510806
auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
10796-
unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
10797-
if (NumParts == 0 || NumParts >= Mask.size() ||
10798-
MaskVecTy->getNumElements() % NumParts != 0 ||
10799-
!hasFullVectorsOrPowerOf2(TTI, MaskVecTy->getElementType(),
10800-
MaskVecTy->getNumElements() / NumParts))
10801-
NumParts = 1;
10807+
unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
1080210808
unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
1080310809
const auto *It =
1080410810
find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
@@ -10813,12 +10819,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
1081310819
}
1081410820
assert(!CommonMask.empty() && "Expected non-empty common mask.");
1081510821
auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
10816-
unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
10817-
if (NumParts == 0 || NumParts >= Mask.size() ||
10818-
MaskVecTy->getNumElements() % NumParts != 0 ||
10819-
!hasFullVectorsOrPowerOf2(TTI, MaskVecTy->getElementType(),
10820-
MaskVecTy->getNumElements() / NumParts))
10821-
NumParts = 1;
10822+
unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
1082210823
unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
1082310824
const auto *It =
1082410825
find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
@@ -11351,7 +11352,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
1135111352
unsigned const NumElts = SrcVecTy->getNumElements();
1135211353
unsigned const NumScalars = VL.size();
1135311354

11354-
unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
11355+
unsigned NumOfParts = ::getNumberOfParts(*TTI, SrcVecTy);
1135511356

1135611357
SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
1135711358
unsigned OffsetBeg = *getElementIndex(VL.front());
@@ -14862,12 +14863,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
1486214863
SmallVector<SmallVector<const TreeEntry *>> Entries;
1486314864
Type *OrigScalarTy = GatheredScalars.front()->getType();
1486414865
auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
14865-
unsigned NumParts = TTI->getNumberOfParts(VecTy);
14866-
if (NumParts == 0 || NumParts >= GatheredScalars.size() ||
14867-
VecTy->getNumElements() % NumParts != 0 ||
14868-
!hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(),
14869-
VecTy->getNumElements() / NumParts))
14870-
NumParts = 1;
14866+
unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, GatheredScalars.size());
1487114867
if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
1487214868
// Check for gathered extracts.
1487314869
bool Resized = false;
@@ -14899,12 +14895,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
1489914895
Resized = true;
1490014896
GatheredScalars.append(VF - GatheredScalars.size(),
1490114897
PoisonValue::get(OrigScalarTy));
14902-
NumParts = TTI->getNumberOfParts(getWidenedType(OrigScalarTy, VF));
14903-
if (NumParts == 0 || NumParts >= GatheredScalars.size() ||
14904-
VecTy->getNumElements() % NumParts != 0 ||
14905-
!hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(),
14906-
VecTy->getNumElements() / NumParts))
14907-
NumParts = 1;
14898+
NumParts =
14899+
::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF), VF);
1490814900
}
1490914901
}
1491014902
}
@@ -17049,10 +17041,10 @@ void BoUpSLP::optimizeGatherSequence() {
1704917041
// Check if the last undefs actually change the final number of used vector
1705017042
// registers.
1705117043
return SM1.size() - LastUndefsCnt > 1 &&
17052-
TTI->getNumberOfParts(SI1->getType()) ==
17053-
TTI->getNumberOfParts(
17054-
getWidenedType(SI1->getType()->getElementType(),
17055-
SM1.size() - LastUndefsCnt));
17044+
::getNumberOfParts(*TTI, SI1->getType()) ==
17045+
::getNumberOfParts(
17046+
*TTI, getWidenedType(SI1->getType()->getElementType(),
17047+
SM1.size() - LastUndefsCnt));
1705617048
};
1705717049
// Perform O(N^2) search over the gather/shuffle sequences and merge identical
1705817050
// instructions. TODO: We can further optimize this scan if we split the
@@ -17829,9 +17821,12 @@ bool BoUpSLP::collectValuesToDemote(
1782917821
const unsigned VF = E.Scalars.size();
1783017822
Type *OrigScalarTy = E.Scalars.front()->getType();
1783117823
if (UniqueBases.size() <= 2 ||
17832-
TTI->getNumberOfParts(getWidenedType(OrigScalarTy, VF)) ==
17833-
TTI->getNumberOfParts(getWidenedType(
17834-
IntegerType::get(OrigScalarTy->getContext(), BitWidth), VF)))
17824+
::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) ==
17825+
::getNumberOfParts(
17826+
*TTI,
17827+
getWidenedType(
17828+
IntegerType::get(OrigScalarTy->getContext(), BitWidth),
17829+
VF)))
1783517830
ToDemote.push_back(E.Idx);
1783617831
}
1783717832
return Res;
@@ -18241,8 +18236,8 @@ void BoUpSLP::computeMinimumValueSizes() {
1824118236
[&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
1824218237
return 0u;
1824318238

18244-
unsigned NumParts = TTI->getNumberOfParts(
18245-
getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
18239+
unsigned NumParts = ::getNumberOfParts(
18240+
*TTI, getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
1824618241

1824718242
// The maximum bit width required to represent all the values that can be
1824818243
// demoted without loss of precision. It would be safe to truncate the roots
@@ -18302,8 +18297,10 @@ void BoUpSLP::computeMinimumValueSizes() {
1830218297
// use - ignore it.
1830318298
if (NumParts > 1 &&
1830418299
NumParts ==
18305-
TTI->getNumberOfParts(getWidenedType(
18306-
IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
18300+
::getNumberOfParts(
18301+
*TTI, getWidenedType(IntegerType::get(F->getContext(),
18302+
bit_ceil(MaxBitWidth)),
18303+
VF)))
1830718304
return 0u;
1830818305

1830918306
unsigned Opcode = E.getOpcode();
@@ -20086,14 +20083,14 @@ class HorizontalReduction {
2008620083
ReduxWidth =
2008720084
getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
2008820085
VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
20089-
NumParts = TTI.getNumberOfParts(Tp);
20086+
NumParts = ::getNumberOfParts(TTI, Tp);
2009020087
NumRegs =
2009120088
TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
2009220089
while (NumParts > NumRegs) {
2009320090
assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
2009420091
ReduxWidth = bit_floor(ReduxWidth - 1);
2009520092
VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
20096-
NumParts = TTI.getNumberOfParts(Tp);
20093+
NumParts = ::getNumberOfParts(TTI, Tp);
2009720094
NumRegs =
2009820095
TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
2009920096
}

llvm/test/Transforms/SLPVectorizer/RISCV/partial-vec-invalid-cost.ll

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,17 @@ define void @partial_vec_invalid_cost() #0 {
77
; CHECK-LABEL: define void @partial_vec_invalid_cost(
88
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
99
; CHECK-NEXT: entry:
10-
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> zeroinitializer)
10+
; CHECK-NEXT: [[LSHR_1:%.*]] = lshr i96 0, 0
11+
; CHECK-NEXT: [[LSHR_2:%.*]] = lshr i96 0, 0
12+
; CHECK-NEXT: [[TRUNC_I96_1:%.*]] = trunc i96 [[LSHR_1]] to i32
13+
; CHECK-NEXT: [[TRUNC_I96_2:%.*]] = trunc i96 [[LSHR_2]] to i32
14+
; CHECK-NEXT: [[TRUNC_I96_3:%.*]] = trunc i96 0 to i32
15+
; CHECK-NEXT: [[TRUNC_I96_4:%.*]] = trunc i96 0 to i32
1116
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> zeroinitializer)
12-
; CHECK-NEXT: [[OP_RDX3:%.*]] = or i32 [[TMP0]], [[TMP1]]
17+
; CHECK-NEXT: [[OP_RDX:%.*]] = or i32 [[TMP1]], [[TRUNC_I96_1]]
18+
; CHECK-NEXT: [[OP_RDX1:%.*]] = or i32 [[TRUNC_I96_2]], [[TRUNC_I96_3]]
19+
; CHECK-NEXT: [[OP_RDX2:%.*]] = or i32 [[OP_RDX]], [[OP_RDX1]]
20+
; CHECK-NEXT: [[OP_RDX3:%.*]] = or i32 [[OP_RDX2]], [[TRUNC_I96_4]]
1321
; CHECK-NEXT: [[STORE_THIS:%.*]] = zext i32 [[OP_RDX3]] to i96
1422
; CHECK-NEXT: store i96 [[STORE_THIS]], ptr null, align 16
1523
; CHECK-NEXT: ret void

0 commit comments

Comments
 (0)