[SLP]Initial support for non-power-of-2 (but still whole register) number of elements in operands. #106449
Conversation
Created using spr 1.3.5
@llvm/pr-subscribers-llvm-transforms

Author: Alexey Bataev (alexey-bataev)

Changes

Patch adds basic support for non-power-of-2 number of elements in operands. The patch still requires that this number addresses whole registers.

Full diff: https://github.com/llvm/llvm-project/pull/106449.diff

2 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index ef5ae9a1a9ccc6..248a7180d9c385 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -260,6 +260,20 @@ static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
VF * getNumElements(ScalarTy));
}
+/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
+/// which forms type, which splits by \p TTI into whole vector types during
+/// legalization.
+static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
+ Type *Ty, unsigned Sz) {
+ if (!isValidElementType(Ty))
+ return PowerOf2Ceil(Sz);
+ // Find the number of elements, which forms full vectors.
+ const unsigned NumParts = TTI.getRegUsageForType(getWidenedType(Ty, Sz));
+ if (NumParts == 0 || NumParts == Sz)
+ return PowerOf2Ceil(Sz);
+ return PowerOf2Ceil(divideCeil(Sz, NumParts)) * NumParts;
+}
+
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
SmallVectorImpl<int> &Mask) {
// The ShuffleBuilder implementation use shufflevector to splat an "element".
@@ -1224,6 +1238,22 @@ static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
(all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
}
+/// Returns true if widened type of \p Ty elements with size \p Sz represents
+/// full vector type, i.e. adding extra element results in extra parts upon type
+/// legalization.
+static bool hasFullVectorsOnly(const TargetTransformInfo &TTI, Type *Ty,
+ unsigned Sz) {
+ if (Sz <= 1)
+ return false;
+ if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty))
+ return false;
+ if (has_single_bit(Sz))
+ return true;
+ const unsigned NumParts = TTI.getRegUsageForType(getWidenedType(Ty, Sz));
+ return NumParts > 0 && NumParts != Sz && has_single_bit(Sz / NumParts) &&
+ Sz % NumParts == 0;
+}
+
namespace slpvectorizer {
/// Bottom Up SLP Vectorizer.
@@ -2455,7 +2485,9 @@ class BoUpSLP {
}
// TODO: Check if we can remove a check for non-power-2 number of
// scalars after full support of non-power-2 vectorization.
- return UniqueValues.size() != 2 && has_single_bit(UniqueValues.size());
+ return UniqueValues.size() != 2 &&
+ hasFullVectorsOnly(*R.TTI, (*UniqueValues.begin())->getType(),
+ UniqueValues.size());
};
// If the initial strategy fails for any of the operand indexes, then we
@@ -3246,8 +3278,9 @@ class BoUpSLP {
SmallVectorImpl<Value *> *AltScalars = nullptr) const;
/// Return true if this is a non-power-of-2 node.
- bool isNonPowOf2Vec() const {
- bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
+ bool isNonPowOf2Vec(const TargetTransformInfo &TTI) const {
+ bool IsNonPowerOf2 = !hasFullVectorsOnly(
+ TTI, getValueType(Scalars.front()), Scalars.size());
assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
"Reshuffling not supported with non-power-of-2 vectors yet.");
return IsNonPowerOf2;
@@ -3425,7 +3458,7 @@ class BoUpSLP {
if (UserTreeIdx.UserTE) {
Last->UserTreeIndices.push_back(UserTreeIdx);
- assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
+ assert((!Last->isNonPowOf2Vec(*TTI) || Last->ReorderIndices.empty()) &&
"Reordering isn't implemented for non-power-of-2 nodes yet");
}
return Last;
@@ -4331,7 +4364,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
if (!isValidElementType(ScalarTy))
return std::nullopt;
auto *VecTy = getWidenedType(ScalarTy, NumScalars);
- int NumParts = TTI->getNumberOfParts(VecTy);
+ int NumParts = TTI->getRegUsageForType(VecTy);
if (NumParts == 0 || NumParts >= NumScalars)
NumParts = 1;
SmallVector<int> ExtractMask;
@@ -4703,7 +4736,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
// Check the order of pointer operands or that all pointers are the same.
bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
// FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
- if (!Order.empty() && !has_single_bit(VL.size())) {
+ if (!Order.empty() && !hasFullVectorsOnly(*TTI, ScalarTy, Sz)) {
assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
"supported with VectorizeNonPowerOf2");
return LoadsState::Gather;
@@ -4761,7 +4794,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
(IsAnyPointerUsedOutGraph ||
((Sz > MinProfitableStridedLoads ||
(AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
- has_single_bit(AbsoluteDiff))) &&
+ hasFullVectorsOnly(*TTI, ScalarTy, AbsoluteDiff))) &&
AbsoluteDiff > Sz) ||
*Diff == -(static_cast<int>(Sz) - 1))) {
int Stride = *Diff / static_cast<int>(Sz - 1);
@@ -5102,7 +5135,7 @@ static bool areTwoInsertFromSameBuildVector(
std::optional<BoUpSLP::OrdersType>
BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
// FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
- if (TE.isNonPowOf2Vec())
+ if (TE.isNonPowOf2Vec(*TTI))
return std::nullopt;
// No need to reorder if need to shuffle reuses, still need to shuffle the
@@ -5136,8 +5169,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
}
}
if (Sz == 2 && TE.getVectorFactor() == 4 &&
- TTI->getNumberOfParts(getWidenedType(TE.Scalars.front()->getType(),
- 2 * TE.getVectorFactor())) == 1)
+ TTI->getRegUsageForType(getWidenedType(TE.Scalars.front()->getType(),
+ 2 * TE.getVectorFactor())) == 1)
return std::nullopt;
if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
Sz)) {
@@ -5491,7 +5524,7 @@ void BoUpSLP::reorderTopToBottom() {
// Reorder the graph nodes according to their vectorization factor.
for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
- VF /= 2) {
+ VF -= 2) {
auto It = VFToOrderedEntries.find(VF);
if (It == VFToOrderedEntries.end())
continue;
@@ -5671,7 +5704,7 @@ bool BoUpSLP::canReorderOperands(
ArrayRef<TreeEntry *> ReorderableGathers,
SmallVectorImpl<TreeEntry *> &GatherOps) {
// FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
- if (UserTE->isNonPowOf2Vec())
+ if (UserTE->isNonPowOf2Vec(*TTI))
return false;
for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
@@ -5846,7 +5879,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
const auto AllowsReordering = [&](const TreeEntry *TE) {
// FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
- if (TE->isNonPowOf2Vec())
+ if (TE->isNonPowOf2Vec(*TTI))
return false;
if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
(TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
@@ -6505,7 +6538,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
case Instruction::ExtractElement: {
bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
// FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
- if (!has_single_bit(VL.size()))
+ if (!hasFullVectorsOnly(*TTI, VL0->getType(), VL.size()))
return TreeEntry::NeedToGather;
if (Reuse || !CurrentOrder.empty())
return TreeEntry::Vectorize;
@@ -6915,7 +6948,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
ReuseShuffleIndices.clear();
} else {
// FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops.
- if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
+ if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec(*TTI)) {
LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
"for nodes with padding.\n");
newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
@@ -6928,7 +6961,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return isa<UndefValue>(V) ||
!isConstant(V);
})) ||
- !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
+ !hasFullVectorsOnly(*TTI, UniqueValues.front()->getType(),
+ NumUniqueScalarValues)) {
if (DoNotFail && UniquePositions.size() > 1 &&
NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
all_of(UniqueValues, [=](Value *V) {
@@ -6936,7 +6970,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
areAllUsersVectorized(cast<Instruction>(V),
UserIgnoreList);
})) {
- unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
+ // Find the number of elements, which forms full vectors.
+ unsigned PWSz = getFullVectorNumberOfElements(
+ *TTI, UniqueValues.front()->getType(), UniqueValues.size());
if (PWSz == VL.size()) {
ReuseShuffleIndices.clear();
} else {
@@ -9147,7 +9183,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
}
assert(!CommonMask.empty() && "Expected non-empty common mask.");
auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
- unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
+ unsigned NumParts = TTI.getRegUsageForType(MaskVecTy);
if (NumParts == 0 || NumParts >= Mask.size())
NumParts = 1;
unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
@@ -9164,7 +9200,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
}
assert(!CommonMask.empty() && "Expected non-empty common mask.");
auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
- unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
+ unsigned NumParts = TTI.getRegUsageForType(MaskVecTy);
if (NumParts == 0 || NumParts >= Mask.size())
NumParts = 1;
unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
@@ -9682,7 +9718,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
unsigned const NumElts = SrcVecTy->getNumElements();
unsigned const NumScalars = VL.size();
- unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
+ unsigned NumOfParts = TTI->getRegUsageForType(SrcVecTy);
SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
unsigned OffsetBeg = *getElementIndex(VL.front());
@@ -10911,7 +10947,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
// Keep original scalar if number of externally used instructions in
// the same entry is not power of 2. It may help to do some extra
// vectorization for now.
- KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
+ KeepScalar =
+ ScalarUsesCount <= 1 ||
+ !hasFullVectorsOnly(*TTI, EU.Scalar->getType(), ScalarUsesCount);
}
if (KeepScalar) {
ExternalUsesAsOriginalScalar.insert(EU.Scalar);
@@ -11602,13 +11640,14 @@ BoUpSLP::isGatherShuffledEntry(
if (TE == VectorizableTree.front().get())
return {};
// FIXME: Gathering for non-power-of-2 nodes not implemented yet.
- if (TE->isNonPowOf2Vec())
+ if (TE->isNonPowOf2Vec(*TTI))
return {};
Mask.assign(VL.size(), PoisonMaskElem);
assert(TE->UserTreeIndices.size() == 1 &&
"Expected only single user of the gather node.");
- assert(VL.size() % NumParts == 0 &&
- "Number of scalars must be divisible by NumParts.");
+ // Number of scalars must be divisible by NumParts.
+ if (VL.size() % NumParts != 0)
+ return {};
unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
SmallVector<std::optional<TTI::ShuffleKind>> Res;
for (unsigned Part : seq<unsigned>(NumParts)) {
@@ -12744,7 +12783,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
SmallVector<SmallVector<const TreeEntry *>> Entries;
Type *OrigScalarTy = GatheredScalars.front()->getType();
auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
- unsigned NumParts = TTI->getNumberOfParts(VecTy);
+ unsigned NumParts = TTI->getRegUsageForType(VecTy);
if (NumParts == 0 || NumParts >= GatheredScalars.size())
NumParts = 1;
if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
@@ -16030,7 +16069,7 @@ void BoUpSLP::computeMinimumValueSizes() {
[&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
return 0u;
- unsigned NumParts = TTI->getNumberOfParts(
+ unsigned NumParts = TTI->getRegUsageForType(
getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
// The maximum bit width required to represent all the values that can be
@@ -16087,7 +16126,7 @@ void BoUpSLP::computeMinimumValueSizes() {
// use - ignore it.
if (NumParts > 1 &&
NumParts ==
- TTI->getNumberOfParts(getWidenedType(
+ TTI->getRegUsageForType(getWidenedType(
IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
return 0u;
@@ -16947,7 +16986,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
for (unsigned I = NextInst; I < MaxInst; ++I) {
unsigned ActualVF = std::min(MaxInst - I, VF);
- if (!has_single_bit(ActualVF))
+ if (!hasFullVectorsOnly(*TTI, ScalarTy, ActualVF))
continue;
if (MaxVFOnly && ActualVF < MaxVF)
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reduction-whole-regs-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reduction-whole-regs-loads.ll
index 54dc33dbc0d00b..c9a3158acdda34 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/reduction-whole-regs-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reduction-whole-regs-loads.ll
@@ -4,15 +4,11 @@
define i64 @test(ptr %p) {
; CHECK-LABEL: @test(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 4
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr [[P]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX_4]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0>
-; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v4i64(<8 x i64> [[TMP2]], <4 x i64> [[TMP0]], i64 0)
-; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v2i64(<8 x i64> [[TMP3]], <2 x i64> [[TMP1]], i64 4)
-; CHECK-NEXT: [[TMP5:%.*]] = mul <8 x i64> [[TMP4]], <i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42>
-; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]])
-; CHECK-NEXT: ret i64 [[TMP6]]
+; CHECK-NEXT: [[TMP0:%.*]] = load <6 x i64>, ptr [[P:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <6 x i64> [[TMP0]], <6 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 0, i32 0>
+; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i64> [[TMP1]], <i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42>
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP2]])
+; CHECK-NEXT: ret i64 [[TMP3]]
;
entry:
%arrayidx.1 = getelementptr inbounds i64, ptr %p, i64 1
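As an aside on the diff above, here is a self-contained sketch of the arithmetic behind the two new helpers (my own illustration, not code from the patch; the register count NumParts is an assumed input standing in for the TTI.getRegUsageForType query, here taken as 3 registers of two i64 elements each):

// Standalone sketch (not part of the patch): the math of
// getFullVectorNumberOfElements and hasFullVectorsOnly, with the
// TTI.getRegUsageForType(...) query replaced by an assumed register count so
// the example compiles on its own.
#include <cassert>

static unsigned powerOf2Ceil(unsigned X) {
  unsigned R = 1;
  while (R < X)
    R <<= 1;
  return R;
}

// Mirrors getFullVectorNumberOfElements: round Sz up to the nearest count that
// splits into NumParts whole registers, each a power-of-2 number of elements.
static unsigned fullVectorNumberOfElements(unsigned Sz, unsigned NumParts) {
  if (NumParts == 0 || NumParts == Sz)
    return powerOf2Ceil(Sz);
  return powerOf2Ceil((Sz + NumParts - 1) / NumParts) * NumParts;
}

// Mirrors hasFullVectorsOnly: Sz fills NumParts registers exactly, each
// holding a power-of-2 number of elements (element-type checks omitted).
static bool hasFullVectorsOnly(unsigned Sz, unsigned NumParts) {
  if (Sz <= 1)
    return false;
  if ((Sz & (Sz - 1)) == 0) // Sz is already a power of 2.
    return true;
  unsigned PerPart = NumParts ? Sz / NumParts : 0;
  return NumParts > 0 && NumParts != Sz && Sz % NumParts == 0 &&
         (PerPart & (PerPart - 1)) == 0;
}

int main() {
  // 6 x i64 on a two-elements-per-register target: exactly 3 whole registers,
  // so the non-power-of-2 VF of 6 is accepted.
  assert(hasFullVectorsOnly(/*Sz=*/6, /*NumParts=*/3));
  // 5 x i64 does not fill whole registers; it is padded up to 6 elements.
  assert(fullVectorNumberOfElements(/*Sz=*/5, /*NumParts=*/3) == 6);
  return 0;
}

This only models the whole-register math; the real helpers additionally validate the element type via isValidElementType and isa&lt;FixedVectorType&gt;.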
@@ -4331,7 +4364,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
if (!isValidElementType(ScalarTy))
return std::nullopt;
auto *VecTy = getWidenedType(ScalarTy, NumScalars);
- int NumParts = TTI->getNumberOfParts(VecTy);
+ int NumParts = TTI->getRegUsageForType(VecTy);
Will we always need both getNumberOfParts and getRegUsageForType, do you think?
In some places - yes. The reshuffling of the buildvector/gather nodes is based on register/parts shuffling. But in other places (where it checks for full register usage etc.) it will be removed later.
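For illustration, the "register/parts shuffling" mentioned here is the per-register slicing visible in the diff's cost paths (SliceSize = getPartNumElems(Mask.size(), NumParts)); a minimal sketch, with the part count assumed rather than queried from TTI:

// Sketch only (assumed part count): split a gather/buildvector mask into
// per-register slices, the way the cost paths iterate over seq(NumParts).
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> Mask = {0, 1, 2, 3, 4, 5, 6, 7}; // 8 scalars
  unsigned NumParts = 2;                            // assumed register count
  unsigned SliceSize = Mask.size() / NumParts;      // getPartNumElems analogue
  for (unsigned Part = 0; Part < NumParts; ++Part)
    std::printf("register %u handles mask elements [%u, %u)\n", Part,
                Part * SliceSize, (Part + 1) * SliceSize);
  return 0;
}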
This triggers failed asserts:

int a[2];
int b, c, d, e;
void f() {
a[b + c * 8] = a[b + c * 8 + 1] = (long long)d + 70912 >> 30;
a[b + c * 8 + 2] = (long long)e + 70912 >> 30;
a[b + c * 8 + 3] = (long long)f + 70912 >> 30;
}

Or:

int *a;
long long b, d;
long e(long long f) {
long c = f >> 4;
return c;
}
void g() {
long i, j;
int h;
{
long c = d >> 4;
j = b >> 4;
i = c;
}
h = e(b * 70);
a[0] = a[1] = i;
a[2] = j;
a[3] = h;
}

Both reproduce like this:
Thanks for the report, feel free to revert for now, will fix next week.
Thanks, I pushed a revert now!
…mber of elements in operands.

Patch adds basic support for non-power-of-2 number of elements in operands. The patch still requires that this number addresses whole registers.

Reviewers: RKSimon

Reviewed By: RKSimon

Pull Request: #106449
We also see crashes with the updated patch a3ea90f.
Unfortunately I don't have a reproducer to share at the moment. I'll try to extract one, but I'm not sure I'll succeed.
I'm hitting a separate assertion failure when building SPEC CPU 2017 for rva22u64_v at O3:
The reduced case below crashes with:

target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
target triple = "riscv64-unknown-linux-gnu"
define void @_ZNK6dealii24TensorProductPolynomialsILi3EE17compute_grad_gradEjRKNS_5PointILi3EEE(ptr %agg.result) #0 personality ptr null {
entry:
%0 = load double, ptr null, align 8
%mul.1 = fmul double %0, 0.000000e+00
%arrayidx.i39.1 = getelementptr i8, ptr %agg.result, i64 8
%add.ptr.i41.1.1 = getelementptr i8, ptr null, i64 8
%1 = load double, ptr %add.ptr.i41.1.1, align 8
%mul.1.1 = fmul double %1, 0.000000e+00
%mul.2.1 = fmul double 0.000000e+00, %mul.1.1
store double %mul.2.1, ptr %arrayidx.i39.1, align 8
%arrayidx.i39.2 = getelementptr i8, ptr %agg.result, i64 16
%mul.1.2 = fmul double %0, 0.000000e+00
%mul.2.2 = fmul double 0.000000e+00, %mul.1.2
store double %mul.2.2, ptr %arrayidx.i39.2, align 8
%arrayidx.i37.1 = getelementptr i8, ptr %agg.result, i64 24
store double %mul.2.1, ptr %arrayidx.i37.1, align 8
%arrayidx.i39.1.1 = getelementptr i8, ptr %agg.result, i64 32
%add.ptr.i41.1.1.1 = getelementptr i8, ptr null, i64 16
%2 = load double, ptr %add.ptr.i41.1.1.1, align 8
%mul.1.1.1 = fmul double %2, 1.000000e+00
%mul.2.1.1 = fmul double 0.000000e+00, %mul.1.1.1
store double %mul.2.1.1, ptr %arrayidx.i39.1.1, align 8
%arrayidx.i39.2.1 = getelementptr i8, ptr %agg.result, i64 40
%mul.1.2.1 = fmul double %1, 0.000000e+00
%mul.2.2.1 = fmul double 0.000000e+00, %mul.1.2.1
store double %mul.2.2.1, ptr %arrayidx.i39.2.1, align 8
%arrayidx.i37.2 = getelementptr i8, ptr %agg.result, i64 48
store double %mul.2.2, ptr %arrayidx.i37.2, align 8
%arrayidx.i39.1.2 = getelementptr i8, ptr %agg.result, i64 56
store double %mul.2.2.1, ptr %arrayidx.i39.1.2, align 8
%arrayidx.i39.2.2 = getelementptr i8, ptr %agg.result, i64 64
%mul.1.2.2 = fmul double 1.000000e+00, 0.000000e+00
%mul.2.2.2 = fmul double 0.000000e+00, %mul.1.2.2
store double %mul.2.2.2, ptr %arrayidx.i39.2.2, align 8
ret void
}
; uselistorder directives
uselistorder ptr null, { 1, 2, 3, 4, 5, 0 }
attributes #0 = { "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+v,+za64rs,+zba,+zbb,+zbs,+zfhmin,+zic64b,+zicbom,+zicbop,+zicboz,+ziccamoa,+ziccif,+zicclsm,+ziccrse,+zicntr,+zicsr,+zihintpause,+zihpm,+zkt,+zmmul,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-b,-e,-experimental-smctr,-experimental-smmpm,-experimental-smnpm,-experimental-ssctr,-experimental-ssnpm,-experimental-sspm,-experimental-supm,-experimental-zacas,-experimental-zalasr,-experimental-zicfilp,-experimental-zicfiss,-experimental-zvbc32e,-experimental-zvkgs,-h,-shcounterenw,-shgatpa,-shtvala,-shvsatpa,-shvstvala,-shvstvecd,-smaia,-smcdeleg,-smcsrind,-smepmp,-smstateen,-ssaia,-ssccfg,-ssccptr,-sscofpmf,-sscounterenw,-sscsrind,-ssqosid,-ssstateen,-ssstrict,-sstc,-sstvala,-sstvecd,-ssu64xl,-svade,-svadu,-svbare,-svinval,-svnapot,-svpbmt,-xcvalu,-xcvbi,-xcvbitmanip,-xcvelw,-xcvmac,-xcvmem,-xcvsimd,-xsfcease,-xsfvcp,-xsfvfnrclipxfqf,-xsfvfwmaccqqq,-xsfvqmaccdod,-xsfvqmaccqoq,-xsifivecdiscarddlone,-xsifivecflushdlone,-xtheadba,-xtheadbb,-xtheadbs,-xtheadcmo,-xtheadcondmov,-xtheadfmemidx,-xtheadmac,-xtheadmemidx,-xtheadmempair,-xtheadsync,-xtheadvdot,-xventanacondops,-xwchc,-za128rs,-zaamo,-zabha,-zalrsc,-zama16b,-zawrs,-zbc,-zbkb,-zbkc,-zbkx,-zca,-zcb,-zcd,-zce,-zcf,-zcmop,-zcmp,-zcmt,-zdinx,-zfa,-zfbfmin,-zfh,-zfinx,-zhinx,-zhinxmin,-zicond,-zifencei,-zihintntl,-zimop,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-ztso,-zvbb,-zvbc,-zvfbfmin,-zvfbfwma,-zvfh,-zvfhmin,-zvkb,-zvkg,-zvkn,-zvknc,-zvkned,-zvkng,-zvknha,-zvknhb,-zvks,-zvksc,-zvksed,-zvksg,-zvksh,-zvkt,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" }
Fixed in b74e09c.
At a high level, why is this a profitable thing to do? The structure here seems to be assuming (via getRegUsageForType) that all fixed vector types are legalized by splitting to m1. This is not true. The actual lowering for your example here uses an m4 load. Why is it reasonable to cost as if we legalize by splitting when we're not?
define i64 @test(ptr %p) {
; CHECK-LABEL: test:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 6, e64, m4, ta, ma
; CHECK-NEXT: vle64.v v8, (a0)
; CHECK-NEXT: lui a0, %hi(.LCPI0_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI0_0)
; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma
; CHECK-NEXT: vle16.v v12, (a0)
; CHECK-NEXT: vrgatherei16.vv v16, v8, v12
; CHECK-NEXT: li a0, 42
; CHECK-NEXT: vmul.vx v8, v16, a0
; CHECK-NEXT: vmv.s.x v12, zero
; CHECK-NEXT: vredsum.vs v8, v8, v12
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%ld = load <6 x i64>, ptr %p, align 4
%shuffle = shufflevector <6 x i64> %ld, <6 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 0, i32 0>
%mul = mul <8 x i64> %shuffle, <i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42>
%sum = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %mul)
ret i64 %sum
}
Unless you have a clear answer to the above, and I'm just missing something obvious, I think this change should be reverted.
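For readers who don't think in RVV register groups, the m1-vs-m4 point works out as follows (my own arithmetic, assuming the minimum VLEN of 128 implied by the +zvl128b attribute shown earlier):

VLEN = 128 bits, so an m1 register group is 128 bits.
<6 x i64> = 384 bits and <8 x i64> = 512 bits, both of which need an LMUL=4 (m4) group of 512 bits.
That matches the "vsetivli ..., e64, m4" in the generated code above: the load is a single m4 operation, not a sequence of m1 pieces, so a cost model that assumes splitting into m1 parts does not reflect the actual lowering.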
@@ -3276,8 +3308,9 @@ class BoUpSLP {
SmallVectorImpl<Value *> *AltScalars = nullptr) const;

/// Return true if this is a non-power-of-2 node.
- bool isNonPowOf2Vec() const {
- bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
+ bool isNonPowOf2Vec(const TargetTransformInfo &TTI) const {
Style wise, I object to this particular part of the change. Having a vector which can be split into power-of-two pieces (which I think is what you're doing here?) is not the same as that vector not being a power of two. You can adjust the callers to allow this special case if you want, but please revert the API change here. It makes the code markedly less readable.
This whole non-power-of-2 handling breaks everything, as I said before. This is a temporary solution, so I don't see an issue here. It will be removed completely once full non-power-of-2 support lands.
The quality of intermediate code states matters. I am specifically providing feedback as a reviewer and asking you to make a style change. Please do so.
I disagree
Looks like this requires using a new TTI entry instead of this one, since it does not work as expected for RISCV. I will add it.
Unless I'm missing something, simply querying the cost of the <6 x i64> type via the existing interfaces should give correct costs on RISCV. I don't know if the work for that has been done for other targets, but I do not see a need for a new TTI call here.
Not quite. getNumberOfParts() transforms this type to <8 x i64> and e.g. returns 4 registers instead of 3 for x86.
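Spelling out that example (my own arithmetic, assuming 128-bit x86 vector registers, i.e. two i64 lanes per register):

<6 x i64> is 6 x 64 = 384 bits, which fills exactly 384 / 128 = 3 registers.
getNumberOfParts() first widens the type to <8 x i64> = 512 bits, giving 512 / 128 = 4 parts, hence the 4-vs-3 discrepancy.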
After some thought I decided to revert the patch. I agree that it does not work the way it is intended.
Patch adds basic support for non-power-of-2 number of elements in
operands. The patch still requires that this number addresses whole
registers.