Commit 571c8c2

Revert "[SLP]Initial support for non-power-of-2 (but still whole register) number of elements in operands."
This reverts commit a3ea90f after the post-commit review. The number of parts is calculated incorrectly.
1 parent 884d7c1
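For context, the arithmetic at the heart of the revert is small. Below is a standalone C++ sketch (not LLVM code: llvm::PowerOf2Ceil and llvm::divideCeil are re-implemented locally, and the part counts fed in are hypothetical, not values from any real target) modeling the reverted getFullVectorNumberOfElements() helper. If TTI reports the wrong number of parts, the helper pads a group of scalars to a different element count, changing which nodes SLP treats as whole-register vectors.

// Standalone sketch, not LLVM code: models the padding arithmetic of the
// reverted getFullVectorNumberOfElements() helper. The part counts passed
// in below are hypothetical inputs, not values from any real target.
#include <cstdint>
#include <cstdio>

// Local stand-ins for llvm::PowerOf2Ceil and llvm::divideCeil.
static uint64_t powerOf2Ceil(uint64_t X) {
  uint64_t P = 1;
  while (P < X)
    P <<= 1;
  return P;
}
static uint64_t divideCeil(uint64_t N, uint64_t D) { return (N + D - 1) / D; }

// Pads Sz scalars so each of NumParts register-sized parts holds a
// power-of-two number of elements (the reverted helper's core formula).
static uint64_t fullVectorNumberOfElements(uint64_t Sz, uint64_t NumParts) {
  if (NumParts == 0 || NumParts >= Sz)
    return powerOf2Ceil(Sz);
  return powerOf2Ceil(divideCeil(Sz, NumParts)) * NumParts;
}

int main() {
  // Six scalars: with 3 parts the group stays at 6 elements (whole
  // registers), but an over-reported part count of 4 pads it to 8 instead,
  // one way a miscalculated NumParts changes vectorization decisions.
  printf("NumParts=3 -> %llu elements\n",
         (unsigned long long)fullVectorNumberOfElements(6, 3)); // prints 6
  printf("NumParts=4 -> %llu elements\n",
         (unsigned long long)fullVectorNumberOfElements(6, 4)); // prints 8
  return 0;
}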

2 files changed: 42 additions & 78 deletions

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 33 additions & 73 deletions
@@ -260,20 +260,6 @@ static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
                                VF * getNumElements(ScalarTy));
 }
 
-/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
-/// which forms type, which splits by \p TTI into whole vector types during
-/// legalization.
-static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
-                                              Type *Ty, unsigned Sz) {
-  if (!isValidElementType(Ty))
-    return PowerOf2Ceil(Sz);
-  // Find the number of elements, which forms full vectors.
-  const unsigned NumParts = TTI.getRegUsageForType(getWidenedType(Ty, Sz));
-  if (NumParts == 0 || NumParts >= Sz)
-    return PowerOf2Ceil(Sz);
-  return PowerOf2Ceil(divideCeil(Sz, NumParts)) * NumParts;
-}
-
 static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
                                                    SmallVectorImpl<int> &Mask) {
   // The ShuffleBuilder implementation use shufflevector to splat an "element".

@@ -1237,22 +1223,6 @@ static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
          (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
 }
 
-/// Returns true if widened type of \p Ty elements with size \p Sz represents
-/// full vector type, i.e. adding extra element results in extra parts upon type
-/// legalization.
-static bool hasFullVectorsOnly(const TargetTransformInfo &TTI, Type *Ty,
-                               unsigned Sz) {
-  if (Sz <= 1)
-    return false;
-  if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty))
-    return false;
-  if (has_single_bit(Sz))
-    return true;
-  const unsigned NumParts = TTI.getRegUsageForType(getWidenedType(Ty, Sz));
-  return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
-         Sz % NumParts == 0;
-}
-
 namespace slpvectorizer {
 
 /// Bottom Up SLP Vectorizer.

@@ -2496,9 +2466,7 @@ class BoUpSLP {
       }
       // TODO: Check if we can remove a check for non-power-2 number of
       // scalars after full support of non-power-2 vectorization.
-      return UniqueValues.size() != 2 &&
-             hasFullVectorsOnly(*R.TTI, (*UniqueValues.begin())->getType(),
-                                UniqueValues.size());
+      return UniqueValues.size() != 2 && has_single_bit(UniqueValues.size());
     };
 
     // If the initial strategy fails for any of the operand indexes, then we

@@ -3307,9 +3275,8 @@ class BoUpSLP {
                      SmallVectorImpl<Value *> *AltScalars = nullptr) const;
 
     /// Return true if this is a non-power-of-2 node.
-    bool isNonPowOf2Vec(const TargetTransformInfo &TTI) const {
-      bool IsNonPowerOf2 = !hasFullVectorsOnly(
-          TTI, getValueType(Scalars.front()), Scalars.size());
+    bool isNonPowOf2Vec() const {
+      bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
       assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
              "Reshuffling not supported with non-power-of-2 vectors yet.");
       return IsNonPowerOf2;

@@ -3487,7 +3454,7 @@ class BoUpSLP {
 
     if (UserTreeIdx.UserTE) {
       Last->UserTreeIndices.push_back(UserTreeIdx);
-      assert((!Last->isNonPowOf2Vec(*TTI) || Last->ReorderIndices.empty()) &&
+      assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
              "Reordering isn't implemented for non-power-of-2 nodes yet");
     }
     return Last;

@@ -4393,7 +4360,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
   if (!isValidElementType(ScalarTy))
     return std::nullopt;
   auto *VecTy = getWidenedType(ScalarTy, NumScalars);
-  int NumParts = TTI->getRegUsageForType(VecTy);
+  int NumParts = TTI->getNumberOfParts(VecTy);
   if (NumParts == 0 || NumParts >= NumScalars)
     NumParts = 1;
   SmallVector<int> ExtractMask;

@@ -4765,7 +4732,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
   // Check the order of pointer operands or that all pointers are the same.
   bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
   // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
-  if (!Order.empty() && !hasFullVectorsOnly(*TTI, ScalarTy, Sz)) {
+  if (!Order.empty() && !has_single_bit(VL.size())) {
     assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
                                    "supported with VectorizeNonPowerOf2");
     return LoadsState::Gather;

@@ -4819,13 +4786,12 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
             });
           });
       const unsigned AbsoluteDiff = std::abs(*Diff);
-      if (IsPossibleStrided &&
-          (IsAnyPointerUsedOutGraph ||
-           ((Sz > MinProfitableStridedLoads ||
-             (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
-              hasFullVectorsOnly(*TTI, ScalarTy, AbsoluteDiff))) &&
-            AbsoluteDiff > Sz) ||
-           *Diff == -(static_cast<int>(Sz) - 1))) {
+      if (IsPossibleStrided && (IsAnyPointerUsedOutGraph ||
+                                ((Sz > MinProfitableStridedLoads ||
+                                  (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
+                                   has_single_bit(AbsoluteDiff))) &&
+                                 AbsoluteDiff > Sz) ||
+                                *Diff == -(static_cast<int>(Sz) - 1))) {
         int Stride = *Diff / static_cast<int>(Sz - 1);
         if (*Diff == Stride * static_cast<int>(Sz - 1)) {
           Align Alignment =

@@ -5230,7 +5196,7 @@ static bool areTwoInsertFromSameBuildVector(
 std::optional<BoUpSLP::OrdersType>
 BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
   // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
-  if (TE.isNonPowOf2Vec(*TTI))
+  if (TE.isNonPowOf2Vec())
     return std::nullopt;
 
   // No need to reorder if need to shuffle reuses, still need to shuffle the

@@ -5264,8 +5230,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
       }
     }
     if (Sz == 2 && TE.getVectorFactor() == 4 &&
-        TTI->getRegUsageForType(getWidenedType(TE.Scalars.front()->getType(),
-                                               2 * TE.getVectorFactor())) == 1)
+        TTI->getNumberOfParts(getWidenedType(TE.Scalars.front()->getType(),
+                                             2 * TE.getVectorFactor())) == 1)
       return std::nullopt;
     if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
                                                      Sz)) {

@@ -5614,7 +5580,7 @@ void BoUpSLP::reorderTopToBottom() {
 
   // Reorder the graph nodes according to their vectorization factor.
   for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
-       VF -= 2) {
+       VF /= 2) {
     auto It = VFToOrderedEntries.find(VF);
     if (It == VFToOrderedEntries.end())
       continue;

@@ -5787,7 +5753,7 @@ bool BoUpSLP::canReorderOperands(
     ArrayRef<TreeEntry *> ReorderableGathers,
     SmallVectorImpl<TreeEntry *> &GatherOps) {
   // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
-  if (UserTE->isNonPowOf2Vec(*TTI))
+  if (UserTE->isNonPowOf2Vec())
     return false;
 
   for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {

@@ -5962,7 +5928,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
       auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
       const auto AllowsReordering = [&](const TreeEntry *TE) {
         // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
-        if (TE->isNonPowOf2Vec(*TTI))
+        if (TE->isNonPowOf2Vec())
           return false;
         if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
             (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||

@@ -6615,7 +6581,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
   case Instruction::ExtractElement: {
     bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
     // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
-    if (!hasFullVectorsOnly(*TTI, VL0->getType(), VL.size()))
+    if (!has_single_bit(VL.size()))
       return TreeEntry::NeedToGather;
     if (Reuse || !CurrentOrder.empty())
       return TreeEntry::Vectorize;

@@ -7025,7 +6991,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       ReuseShuffleIndices.clear();
     } else {
       // FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops.
-      if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec(*TTI)) {
+      if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
        LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
                             "for nodes with padding.\n");
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);

@@ -7038,18 +7004,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                      return isa<UndefValue>(V) ||
                             !isConstant(V);
                    })) ||
-          !hasFullVectorsOnly(*TTI, UniqueValues.front()->getType(),
-                              NumUniqueScalarValues)) {
+          !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
        if (DoNotFail && UniquePositions.size() > 1 &&
            NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
            all_of(UniqueValues, [=](Value *V) {
              return isa<ExtractElementInst>(V) ||
                     areAllUsersVectorized(cast<Instruction>(V),
                                           UserIgnoreList);
            })) {
-          // Find the number of elements, which forms full vectors.
-          unsigned PWSz = getFullVectorNumberOfElements(
-              *TTI, UniqueValues.front()->getType(), UniqueValues.size());
+          unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
          if (PWSz == VL.size()) {
            ReuseShuffleIndices.clear();
          } else {

@@ -9260,7 +9223,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
     }
     assert(!CommonMask.empty() && "Expected non-empty common mask.");
     auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
-    unsigned NumParts = TTI.getRegUsageForType(MaskVecTy);
+    unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
     if (NumParts == 0 || NumParts >= Mask.size())
       NumParts = 1;
     unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);

@@ -9277,7 +9240,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
     }
     assert(!CommonMask.empty() && "Expected non-empty common mask.");
     auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
-    unsigned NumParts = TTI.getRegUsageForType(MaskVecTy);
+    unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
     if (NumParts == 0 || NumParts >= Mask.size())
       NumParts = 1;
     unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);

@@ -9783,7 +9746,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     unsigned const NumElts = SrcVecTy->getNumElements();
     unsigned const NumScalars = VL.size();
 
-    unsigned NumOfParts = TTI->getRegUsageForType(SrcVecTy);
+    unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
 
     SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
     unsigned OffsetBeg = *getElementIndex(VL.front());

@@ -11027,9 +10990,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
           // Keep original scalar if number of externally used instructions in
           // the same entry is not power of 2. It may help to do some extra
           // vectorization for now.
-          KeepScalar =
-              ScalarUsesCount <= 1 ||
-              !hasFullVectorsOnly(*TTI, EU.Scalar->getType(), ScalarUsesCount);
+          KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
         }
         if (KeepScalar) {
           ExternalUsesAsOriginalScalar.insert(EU.Scalar);

@@ -11726,14 +11687,13 @@ BoUpSLP::isGatherShuffledEntry(
   if (TE == VectorizableTree.front().get())
     return {};
   // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
-  if (TE->isNonPowOf2Vec(*TTI))
+  if (TE->isNonPowOf2Vec())
     return {};
   Mask.assign(VL.size(), PoisonMaskElem);
   assert(TE->UserTreeIndices.size() == 1 &&
          "Expected only single user of the gather node.");
-  // Number of scalars must be divisible by NumParts.
-  if (VL.size() % NumParts != 0)
-    return {};
+  assert(VL.size() % NumParts == 0 &&
+         "Number of scalars must be divisible by NumParts.");
   unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
   SmallVector<std::optional<TTI::ShuffleKind>> Res;
   for (unsigned Part : seq<unsigned>(NumParts)) {

@@ -12872,7 +12832,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
   SmallVector<SmallVector<const TreeEntry *>> Entries;
   Type *OrigScalarTy = GatheredScalars.front()->getType();
   auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
-  unsigned NumParts = TTI->getRegUsageForType(VecTy);
+  unsigned NumParts = TTI->getNumberOfParts(VecTy);
   if (NumParts == 0 || NumParts >= GatheredScalars.size())
     NumParts = 1;
   if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {

@@ -16126,7 +16086,7 @@ void BoUpSLP::computeMinimumValueSizes() {
                [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
       return 0u;
 
-    unsigned NumParts = TTI->getRegUsageForType(
+    unsigned NumParts = TTI->getNumberOfParts(
         getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
 
     // The maximum bit width required to represent all the values that can be

@@ -16183,7 +16143,7 @@ void BoUpSLP::computeMinimumValueSizes() {
     // use - ignore it.
     if (NumParts > 1 &&
         NumParts ==
-            TTI->getRegUsageForType(getWidenedType(
+            TTI->getNumberOfParts(getWidenedType(
                 IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
       return 0u;
 
@@ -17044,7 +17004,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
     for (unsigned I = NextInst; I < MaxInst; ++I) {
       unsigned ActualVF = std::min(MaxInst - I, VF);
 
-      if (!hasFullVectorsOnly(*TTI, ScalarTy, ActualVF))
+      if (!has_single_bit(ActualVF))
        continue;
 
       if (MaxVFOnly && ActualVF < MaxVF)
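The same substitution repeats throughout the file above: calls to the removed hasFullVectorsOnly(*TTI, ...) go back to has_single_bit(...), and TTI->getRegUsageForType(...) goes back to TTI->getNumberOfParts(...). A standalone sketch of the two gates side by side (plain C++, not LLVM code; NumParts is a caller-supplied stand-in for the TTI query, which is exactly the value the commit message says was computed incorrectly):

// Standalone sketch, not LLVM code: the restored power-of-two gate next to
// a model of the reverted hasFullVectorsOnly() gate.
#include <cstdio>

// Mirrors the effect of llvm::has_single_bit on a group size.
static bool isPow2(unsigned Sz) { return Sz != 0 && (Sz & (Sz - 1)) == 0; }

// Models the reverted gate; NumParts is supplied by the caller here instead
// of being queried from TTI.
static bool wholeRegisterGate(unsigned Sz, unsigned NumParts) {
  if (Sz <= 1)
    return false;
  if (isPow2(Sz))
    return true; // power-of-two sizes were always accepted
  // Non-power-of-two sizes were accepted when they split evenly into
  // NumParts parts of power-of-two width.
  return NumParts > 0 && NumParts < Sz && isPow2(Sz / NumParts) &&
         Sz % NumParts == 0;
}

int main() {
  // A 6-wide group: rejected by the restored check, accepted by the
  // reverted one when the target reports 3 parts.
  printf("isPow2(6) = %d\n", isPow2(6));                             // 0
  printf("wholeRegisterGate(6, 3) = %d\n", wholeRegisterGate(6, 3)); // 1
  return 0;
}

With only the power-of-two gate left, 6-wide groups fall back to being gathered or split, which is what the updated RISC-V test below checks.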

llvm/test/Transforms/SLPVectorizer/RISCV/reduction-whole-regs-loads.ll

Lines changed: 9 additions & 5 deletions
@@ -4,11 +4,15 @@
 define i64 @test(ptr %p) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <6 x i64>, ptr [[P:%.*]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <6 x i64> [[TMP0]], <6 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 0, i32 0>
-; CHECK-NEXT:    [[TMP2:%.*]] = mul <8 x i64> [[TMP1]], <i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42>
-; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP2]])
-; CHECK-NEXT:    ret i64 [[TMP3]]
+; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[P]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX_4]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v4i64(<8 x i64> [[TMP2]], <4 x i64> [[TMP0]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v2i64(<8 x i64> [[TMP3]], <2 x i64> [[TMP1]], i64 4)
+; CHECK-NEXT:    [[TMP5:%.*]] = mul <8 x i64> [[TMP4]], <i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42>
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]])
+; CHECK-NEXT:    ret i64 [[TMP6]]
 ;
 entry:
   %arrayidx.1 = getelementptr inbounds i64, ptr %p, i64 1
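The updated CHECK lines show the functional effect of the revert on this RISC-V test: the single non-power-of-2 <6 x i64> load is gone, and the six elements are instead loaded as power-of-two chunks (<4 x i64> plus <2 x i64>) and recombined into an <8 x i64> with llvm.vector.insert before the reduction. A small standalone sketch of that chunking (plain C++20; the greedy split is an illustration consistent with this test's output, not a claim about the vectorizer's exact algorithm):

// Standalone sketch: split a 6-element group into greedy power-of-two
// chunks, matching the <4 x i64> + <2 x i64> loads in the CHECK lines.
#include <bit>    // std::bit_floor (C++20)
#include <cstdio>

int main() {
  unsigned Remaining = 6;
  unsigned Offset = 0;
  while (Remaining != 0) {
    // Largest power of two not exceeding the remaining element count.
    unsigned Chunk = std::bit_floor(Remaining);
    printf("load <%u x i64> at element offset %u\n", Chunk, Offset);
    Offset += Chunk;
    Remaining -= Chunk;
  }
  return 0; // prints a 4-wide chunk, then a 2-wide chunk
}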
