Commit 9e86d4f

Revert "[SLP]Initial support for non-power-of-2 (but still whole register) number of elements in operands."
This reverts commit 6ab07d7, which caused failed asserts; see #106449.
Parent: 22f9874

2 files changed, 42 additions (+) and 78 deletions (−)


llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 33 additions & 73 deletions
@@ -260,20 +260,6 @@ static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
                                VF * getNumElements(ScalarTy));
 }
 
-/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
-/// which forms type, which splits by \p TTI into whole vector types during
-/// legalization.
-static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
-                                              Type *Ty, unsigned Sz) {
-  if (!isValidElementType(Ty))
-    return PowerOf2Ceil(Sz);
-  // Find the number of elements, which forms full vectors.
-  const unsigned NumParts = TTI.getRegUsageForType(getWidenedType(Ty, Sz));
-  if (NumParts == 0 || NumParts == Sz)
-    return PowerOf2Ceil(Sz);
-  return PowerOf2Ceil(divideCeil(Sz, NumParts)) * NumParts;
-}
-
 static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
                                                    SmallVectorImpl<int> &Mask) {
   // The ShuffleBuilder implementation use shufflevector to splat an "element".
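
For reference, the arithmetic the removed getFullVectorNumberOfElements performed can be reproduced in isolation. In the sketch below, NumParts is a stand-in for the target-dependent TTI.getRegUsageForType query, and std::bit_ceil plays the role of LLVM's PowerOf2Ceil:

    // Minimal standalone sketch of the reverted helper; NumParts replaces
    // the TTI.getRegUsageForType(getWidenedType(Ty, Sz)) query.
    #include <bit> // std::bit_ceil (C++20)

    unsigned fullVectorNumberOfElements(unsigned Sz, unsigned NumParts) {
      if (NumParts == 0 || NumParts == Sz)
        return std::bit_ceil(Sz); // degenerate: round up to a power of 2
      unsigned PerPart = (Sz + NumParts - 1) / NumParts; // divideCeil(Sz, NumParts)
      return std::bit_ceil(PerPart) * NumParts; // smallest whole-register multiple
    }

For example, 6 scalars legalizing into 3 registers give bit_ceil(2) * 3 == 6, so a 6-element operand was treated as already filling whole registers rather than being widened to 8 elements.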
@@ -1238,22 +1224,6 @@ static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
          (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
 }
 
-/// Returns true if widened type of \p Ty elements with size \p Sz represents
-/// full vector type, i.e. adding extra element results in extra parts upon type
-/// legalization.
-static bool hasFullVectorsOnly(const TargetTransformInfo &TTI, Type *Ty,
-                               unsigned Sz) {
-  if (Sz <= 1)
-    return false;
-  if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty))
-    return false;
-  if (has_single_bit(Sz))
-    return true;
-  const unsigned NumParts = TTI.getRegUsageForType(getWidenedType(Ty, Sz));
-  return NumParts > 0 && NumParts != Sz && has_single_bit(Sz / NumParts) &&
-         Sz % NumParts == 0;
-}
-
 namespace slpvectorizer {
 
 /// Bottom Up SLP Vectorizer.
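
The companion predicate removed here can be sketched the same way; again NumParts stands in for the target query, and the type-validity checks are dropped for brevity:

    #include <bit> // std::has_single_bit (C++20)

    bool hasFullVectorsOnlySketch(unsigned Sz, unsigned NumParts) {
      if (Sz <= 1)
        return false;
      if (std::has_single_bit(Sz))
        return true; // power-of-2 element counts always qualify
      // Otherwise require an even split into parts of power-of-2 width.
      return NumParts > 0 && NumParts != Sz &&
             Sz % NumParts == 0 && std::has_single_bit(Sz / NumParts);
    }

With Sz = 6 and NumParts = 3 this returns true (6 % 3 == 0 and 6 / 3 == 2 is a power of two), which is how the reverted patch admitted 6-element nodes; after the revert only the has_single_bit path remains and 6 is rejected.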
@@ -2497,9 +2467,7 @@ class BoUpSLP {
       }
       // TODO: Check if we can remove a check for non-power-2 number of
       // scalars after full support of non-power-2 vectorization.
-      return UniqueValues.size() != 2 &&
-             hasFullVectorsOnly(*R.TTI, (*UniqueValues.begin())->getType(),
-                                UniqueValues.size());
+      return UniqueValues.size() != 2 && has_single_bit(UniqueValues.size());
     };
 
     // If the initial strategy fails for any of the operand indexes, then we
@@ -3308,9 +3276,8 @@ class BoUpSLP {
                       SmallVectorImpl<Value *> *AltScalars = nullptr) const;
 
     /// Return true if this is a non-power-of-2 node.
-    bool isNonPowOf2Vec(const TargetTransformInfo &TTI) const {
-      bool IsNonPowerOf2 = !hasFullVectorsOnly(
-          TTI, getValueType(Scalars.front()), Scalars.size());
+    bool isNonPowOf2Vec() const {
+      bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
       assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
              "Reshuffling not supported with non-power-of-2 vectors yet.");
       return IsNonPowerOf2;
@@ -3488,7 +3455,7 @@ class BoUpSLP {
 
     if (UserTreeIdx.UserTE) {
       Last->UserTreeIndices.push_back(UserTreeIdx);
-      assert((!Last->isNonPowOf2Vec(*TTI) || Last->ReorderIndices.empty()) &&
+      assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
              "Reordering isn't implemented for non-power-of-2 nodes yet");
     }
     return Last;
@@ -4394,7 +4361,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
   if (!isValidElementType(ScalarTy))
     return std::nullopt;
   auto *VecTy = getWidenedType(ScalarTy, NumScalars);
-  int NumParts = TTI->getRegUsageForType(VecTy);
+  int NumParts = TTI->getNumberOfParts(VecTy);
   if (NumParts == 0 || NumParts >= NumScalars)
     NumParts = 1;
   SmallVector<int> ExtractMask;
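
Most of the mechanical changes in this commit swap TTI.getRegUsageForType back to TTI.getNumberOfParts when computing NumParts; both queries can return a degenerate answer, so every call site keeps the same guard. A condensed sketch of that recurring shape (not an LLVM API, just the pattern visible at the call sites below):

    // Clamp a parts query to something usable: one part when the query
    // fails (0) or when splitting would leave fewer than one scalar per part.
    unsigned clampNumParts(unsigned NumParts, unsigned NumScalars) {
      if (NumParts == 0 || NumParts >= NumScalars)
        return 1;
      return NumParts;
    }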
@@ -4766,7 +4733,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
   // Check the order of pointer operands or that all pointers are the same.
   bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
   // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
-  if (!Order.empty() && !hasFullVectorsOnly(*TTI, ScalarTy, Sz)) {
+  if (!Order.empty() && !has_single_bit(VL.size())) {
     assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
                                    "supported with VectorizeNonPowerOf2");
     return LoadsState::Gather;
@@ -4820,13 +4787,12 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
           });
         });
         const unsigned AbsoluteDiff = std::abs(*Diff);
-        if (IsPossibleStrided &&
-            (IsAnyPointerUsedOutGraph ||
-             ((Sz > MinProfitableStridedLoads ||
-               (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
-                hasFullVectorsOnly(*TTI, ScalarTy, AbsoluteDiff))) &&
-              AbsoluteDiff > Sz) ||
-             *Diff == -(static_cast<int>(Sz) - 1))) {
+        if (IsPossibleStrided && (IsAnyPointerUsedOutGraph ||
+                                  ((Sz > MinProfitableStridedLoads ||
+                                    (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
+                                     has_single_bit(AbsoluteDiff))) &&
+                                   AbsoluteDiff > Sz) ||
+                                  *Diff == -(static_cast<int>(Sz) - 1))) {
           int Stride = *Diff / static_cast<int>(Sz - 1);
           if (*Diff == Stride * static_cast<int>(Sz - 1)) {
             Align Alignment =
@@ -5231,7 +5197,7 @@ static bool areTwoInsertFromSameBuildVector(
 std::optional<BoUpSLP::OrdersType>
 BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
   // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
-  if (TE.isNonPowOf2Vec(*TTI))
+  if (TE.isNonPowOf2Vec())
     return std::nullopt;
 
   // No need to reorder if need to shuffle reuses, still need to shuffle the
@@ -5265,8 +5231,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
       }
     }
     if (Sz == 2 && TE.getVectorFactor() == 4 &&
-        TTI->getRegUsageForType(getWidenedType(TE.Scalars.front()->getType(),
-                                               2 * TE.getVectorFactor())) == 1)
+        TTI->getNumberOfParts(getWidenedType(TE.Scalars.front()->getType(),
+                                             2 * TE.getVectorFactor())) == 1)
       return std::nullopt;
     if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
                                                      Sz)) {
@@ -5615,7 +5581,7 @@ void BoUpSLP::reorderTopToBottom() {
 
   // Reorder the graph nodes according to their vectorization factor.
   for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
-       VF -= 2) {
+       VF /= 2) {
     auto It = VFToOrderedEntries.find(VF);
     if (It == VFToOrderedEntries.end())
       continue;
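
The loop change above is easy to overlook: the reverted patch stepped the vectorization factor down by 2 so that non-power-of-2 factors were also visited, while the restored code halves it each iteration. A toy comparison of the two traversals, assuming a starting factor of 8:

    #include <cstdio>

    int main() {
      for (unsigned VF = 8; VF > 1; VF -= 2) // reverted patch: visits 8, 6, 4, 2
        std::printf("%u ", VF);
      std::printf("\n");
      for (unsigned VF = 8; VF > 1; VF /= 2) // restored code: visits 8, 4, 2
        std::printf("%u ", VF);
      std::printf("\n");
    }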
@@ -5788,7 +5754,7 @@ bool BoUpSLP::canReorderOperands(
     ArrayRef<TreeEntry *> ReorderableGathers,
     SmallVectorImpl<TreeEntry *> &GatherOps) {
   // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
-  if (UserTE->isNonPowOf2Vec(*TTI))
+  if (UserTE->isNonPowOf2Vec())
     return false;
 
   for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
@@ -5963,7 +5929,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
       auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
       const auto AllowsReordering = [&](const TreeEntry *TE) {
         // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
-        if (TE->isNonPowOf2Vec(*TTI))
+        if (TE->isNonPowOf2Vec())
           return false;
         if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
             (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
@@ -6609,7 +6575,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
   case Instruction::ExtractElement: {
     bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
     // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
-    if (!hasFullVectorsOnly(*TTI, VL0->getType(), VL.size()))
+    if (!has_single_bit(VL.size()))
       return TreeEntry::NeedToGather;
     if (Reuse || !CurrentOrder.empty())
       return TreeEntry::Vectorize;
@@ -7019,7 +6985,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       ReuseShuffleIndices.clear();
     } else {
       // FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops.
-      if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec(*TTI)) {
+      if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
         LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
                              "for nodes with padding.\n");
         newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
@@ -7032,18 +6998,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                  return isa<UndefValue>(V) ||
                         !isConstant(V);
                })) ||
-        !hasFullVectorsOnly(*TTI, UniqueValues.front()->getType(),
-                            NumUniqueScalarValues)) {
+        !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
       if (DoNotFail && UniquePositions.size() > 1 &&
           NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
           all_of(UniqueValues, [=](Value *V) {
             return isa<ExtractElementInst>(V) ||
                    areAllUsersVectorized(cast<Instruction>(V),
                                          UserIgnoreList);
           })) {
-        // Find the number of elements, which forms full vectors.
-        unsigned PWSz = getFullVectorNumberOfElements(
-            *TTI, UniqueValues.front()->getType(), UniqueValues.size());
+        unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
         if (PWSz == VL.size()) {
           ReuseShuffleIndices.clear();
         } else {
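
With the revert, the padded width for reshuffled unique scalars goes back to the next power of two rather than a whole-register multiple. A one-liner shows the restored computation (std::bit_ceil is the standard counterpart of LLVM's PowerOf2Ceil):

    #include <bit>

    // Restored behavior: e.g. 6 unique scalars pad out to PWSz = 8, so two
    // ReuseShuffleIndices lanes become padding.
    unsigned paddedWidth(unsigned NumUnique) { return std::bit_ceil(NumUnique); }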
@@ -9254,7 +9217,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
     }
     assert(!CommonMask.empty() && "Expected non-empty common mask.");
     auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
-    unsigned NumParts = TTI.getRegUsageForType(MaskVecTy);
+    unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
     if (NumParts == 0 || NumParts >= Mask.size())
       NumParts = 1;
     unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
@@ -9271,7 +9234,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
     }
     assert(!CommonMask.empty() && "Expected non-empty common mask.");
     auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
-    unsigned NumParts = TTI.getRegUsageForType(MaskVecTy);
+    unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
     if (NumParts == 0 || NumParts >= Mask.size())
       NumParts = 1;
     unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
@@ -9777,7 +9740,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     unsigned const NumElts = SrcVecTy->getNumElements();
     unsigned const NumScalars = VL.size();
 
-    unsigned NumOfParts = TTI->getRegUsageForType(SrcVecTy);
+    unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
 
     SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
     unsigned OffsetBeg = *getElementIndex(VL.front());
@@ -10993,9 +10956,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
           // Keep original scalar if number of externally used instructions in
           // the same entry is not power of 2. It may help to do some extra
           // vectorization for now.
-          KeepScalar =
-              ScalarUsesCount <= 1 ||
-              !hasFullVectorsOnly(*TTI, EU.Scalar->getType(), ScalarUsesCount);
+          KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
         }
         if (KeepScalar) {
           ExternalUsesAsOriginalScalar.insert(EU.Scalar);
@@ -11688,14 +11649,13 @@ BoUpSLP::isGatherShuffledEntry(
   if (TE == VectorizableTree.front().get())
     return {};
   // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
-  if (TE->isNonPowOf2Vec(*TTI))
+  if (TE->isNonPowOf2Vec())
     return {};
   Mask.assign(VL.size(), PoisonMaskElem);
   assert(TE->UserTreeIndices.size() == 1 &&
          "Expected only single user of the gather node.");
-  // Number of scalars must be divisible by NumParts.
-  if (VL.size() % NumParts != 0)
-    return {};
+  assert(VL.size() % NumParts == 0 &&
+         "Number of scalars must be divisible by NumParts.");
   unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
   SmallVector<std::optional<TTI::ShuffleKind>> Res;
   for (unsigned Part : seq<unsigned>(NumParts)) {
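
isGatherShuffledEntry walks the gather node one part at a time, and the restored assert makes divisibility a precondition instead of a graceful bailout. A simplified sketch of that slicing, assuming an exact even split (which the assert guarantees here; LLVM's getPartNumElems may round differently in other contexts):

    #include <cassert>
    #include <cstddef>

    // Visit a scalar list in NumParts equal slices, mirroring the loop below.
    template <typename Fn>
    void forEachPart(std::size_t NumScalars, unsigned NumParts, Fn Visit) {
      assert(NumScalars % NumParts == 0 &&
             "Number of scalars must be divisible by NumParts.");
      std::size_t SliceSize = NumScalars / NumParts;
      for (unsigned Part = 0; Part < NumParts; ++Part)
        Visit(Part * SliceSize, SliceSize); // offset and length of this slice
    }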
@@ -12834,7 +12794,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
   SmallVector<SmallVector<const TreeEntry *>> Entries;
   Type *OrigScalarTy = GatheredScalars.front()->getType();
   auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
-  unsigned NumParts = TTI->getRegUsageForType(VecTy);
+  unsigned NumParts = TTI->getNumberOfParts(VecTy);
   if (NumParts == 0 || NumParts >= GatheredScalars.size())
     NumParts = 1;
   if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
@@ -16080,7 +16040,7 @@ void BoUpSLP::computeMinimumValueSizes() {
         [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
       return 0u;
 
-    unsigned NumParts = TTI->getRegUsageForType(
+    unsigned NumParts = TTI->getNumberOfParts(
         getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
 
     // The maximum bit width required to represent all the values that can be
@@ -16137,7 +16097,7 @@ void BoUpSLP::computeMinimumValueSizes() {
     // use - ignore it.
     if (NumParts > 1 &&
         NumParts ==
-            TTI->getRegUsageForType(getWidenedType(
+            TTI->getNumberOfParts(getWidenedType(
                 IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
       return 0u;
 
@@ -16998,7 +16958,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
   for (unsigned I = NextInst; I < MaxInst; ++I) {
     unsigned ActualVF = std::min(MaxInst - I, VF);
 
-    if (!hasFullVectorsOnly(*TTI, ScalarTy, ActualVF))
+    if (!has_single_bit(ActualVF))
       continue;
 
     if (MaxVFOnly && ActualVF < MaxVF)

llvm/test/Transforms/SLPVectorizer/RISCV/reduction-whole-regs-loads.ll

Lines changed: 9 additions & 5 deletions
@@ -4,11 +4,15 @@
 define i64 @test(ptr %p) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <6 x i64>, ptr [[P:%.*]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <6 x i64> [[TMP0]], <6 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 0, i32 0>
-; CHECK-NEXT:    [[TMP2:%.*]] = mul <8 x i64> [[TMP1]], <i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42>
-; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP2]])
-; CHECK-NEXT:    ret i64 [[TMP3]]
+; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[P]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX_4]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v4i64(<8 x i64> [[TMP2]], <4 x i64> [[TMP0]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v2i64(<8 x i64> [[TMP3]], <2 x i64> [[TMP1]], i64 4)
+; CHECK-NEXT:    [[TMP5:%.*]] = mul <8 x i64> [[TMP4]], <i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42>
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]])
+; CHECK-NEXT:    ret i64 [[TMP6]]
 ;
 entry:
   %arrayidx.1 = getelementptr inbounds i64, ptr %p, i64 1
