[SLP]Initial support for non-power-of-2 (but still whole register) number of elements in operands. #106449


Conversation

alexey-bataev
Member

Patch adds basic support for non-power-of-2 number of elements in
operands. The patch still requires that this number addresses whole
registers.

Created using spr 1.3.5
@llvmbot
Member

llvmbot commented Aug 28, 2024

@llvm/pr-subscribers-llvm-transforms

Author: Alexey Bataev (alexey-bataev)

Changes

Patch adds basic support for non-power-of-2 number of elements in
operands. The patch still requires that this number addresses whole
registers.
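
For readers unfamiliar with the "whole register" restriction, the sketch below mirrors the arithmetic of the getFullVectorNumberOfElements() helper added by the patch. It is a minimal standalone C++ model assuming a hypothetical target with 128-bit vector registers; regUsage() is a stand-in for TTI.getRegUsageForType(), not the real TTI query.

#include <iostream>

// Round up to the next power of two (mirrors llvm::PowerOf2Ceil).
static unsigned powerOf2Ceil(unsigned X) {
  unsigned P = 1;
  while (P < X)
    P <<= 1;
  return P;
}

// Hypothetical stand-in for TTI.getRegUsageForType(): how many vector
// registers a <Sz x elt> vector occupies for a fixed register width.
static unsigned regUsage(unsigned Sz, unsigned EltBits, unsigned RegBits) {
  return (Sz * EltBits + RegBits - 1) / RegBits;
}

// Mirrors getFullVectorNumberOfElements() from the patch: the smallest
// element count >= Sz whose widened type splits into whole registers.
static unsigned fullVectorNumElements(unsigned Sz, unsigned EltBits,
                                      unsigned RegBits) {
  unsigned NumParts = regUsage(Sz, EltBits, RegBits);
  if (NumParts == 0 || NumParts == Sz)
    return powerOf2Ceil(Sz);
  unsigned PerPart = (Sz + NumParts - 1) / NumParts; // divideCeil(Sz, NumParts)
  return powerOf2Ceil(PerPart) * NumParts;
}

int main() {
  // 6 x i64 with 128-bit registers: three whole 2-element registers, so 6 is
  // already a "full vector" count even though it is not a power of two.
  std::cout << fullVectorNumElements(6, 64, 128) << '\n'; // 6
  // 7 x i64 with 128-bit registers: rounded up to 8 elements (4 registers).
  std::cout << fullVectorNumElements(7, 64, 128) << '\n'; // 8
  return 0;
}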


Full diff: https://github.com/llvm/llvm-project/pull/106449.diff

2 Files Affected:

  • (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+67-28)
  • (modified) llvm/test/Transforms/SLPVectorizer/RISCV/reduction-whole-regs-loads.ll (+5-9)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index ef5ae9a1a9ccc6..248a7180d9c385 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -260,6 +260,20 @@ static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
                               VF * getNumElements(ScalarTy));
 }
 
+/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
+/// which forms type, which splits by \p TTI into whole vector types during
+/// legalization.
+static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
+                                              Type *Ty, unsigned Sz) {
+  if (!isValidElementType(Ty))
+    return PowerOf2Ceil(Sz);
+  // Find the number of elements, which forms full vectors.
+  const unsigned NumParts = TTI.getRegUsageForType(getWidenedType(Ty, Sz));
+  if (NumParts == 0 || NumParts == Sz)
+    return PowerOf2Ceil(Sz);
+  return PowerOf2Ceil(divideCeil(Sz, NumParts)) * NumParts;
+}
+
 static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
                                                    SmallVectorImpl<int> &Mask) {
   // The ShuffleBuilder implementation use shufflevector to splat an "element".
@@ -1224,6 +1238,22 @@ static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
          (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
 }
 
+/// Returns true if widened type of \p Ty elements with size \p Sz represents
+/// full vector type, i.e. adding extra element results in extra parts upon type
+/// legalization.
+static bool hasFullVectorsOnly(const TargetTransformInfo &TTI, Type *Ty,
+                               unsigned Sz) {
+  if (Sz <= 1)
+    return false;
+  if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty))
+    return false;
+  if (has_single_bit(Sz))
+    return true;
+  const unsigned NumParts = TTI.getRegUsageForType(getWidenedType(Ty, Sz));
+  return NumParts > 0 && NumParts != Sz && has_single_bit(Sz / NumParts) &&
+         Sz % NumParts == 0;
+}
+
 namespace slpvectorizer {
 
 /// Bottom Up SLP Vectorizer.
@@ -2455,7 +2485,9 @@ class BoUpSLP {
         }
         // TODO: Check if we can remove a check for non-power-2 number of
         // scalars after full support of non-power-2 vectorization.
-        return UniqueValues.size() != 2 && has_single_bit(UniqueValues.size());
+        return UniqueValues.size() != 2 &&
+               hasFullVectorsOnly(*R.TTI, (*UniqueValues.begin())->getType(),
+                                  UniqueValues.size());
       };
 
       // If the initial strategy fails for any of the operand indexes, then we
@@ -3246,8 +3278,9 @@ class BoUpSLP {
                           SmallVectorImpl<Value *> *AltScalars = nullptr) const;
 
     /// Return true if this is a non-power-of-2 node.
-    bool isNonPowOf2Vec() const {
-      bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
+    bool isNonPowOf2Vec(const TargetTransformInfo &TTI) const {
+      bool IsNonPowerOf2 = !hasFullVectorsOnly(
+          TTI, getValueType(Scalars.front()), Scalars.size());
       assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
              "Reshuffling not supported with non-power-of-2 vectors yet.");
       return IsNonPowerOf2;
@@ -3425,7 +3458,7 @@ class BoUpSLP {
 
     if (UserTreeIdx.UserTE) {
       Last->UserTreeIndices.push_back(UserTreeIdx);
-      assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
+      assert((!Last->isNonPowOf2Vec(*TTI) || Last->ReorderIndices.empty()) &&
              "Reordering isn't implemented for non-power-of-2 nodes yet");
     }
     return Last;
@@ -4331,7 +4364,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
   if (!isValidElementType(ScalarTy))
     return std::nullopt;
   auto *VecTy = getWidenedType(ScalarTy, NumScalars);
-  int NumParts = TTI->getNumberOfParts(VecTy);
+  int NumParts = TTI->getRegUsageForType(VecTy);
   if (NumParts == 0 || NumParts >= NumScalars)
     NumParts = 1;
   SmallVector<int> ExtractMask;
@@ -4703,7 +4736,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
   // Check the order of pointer operands or that all pointers are the same.
   bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
   // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
-  if (!Order.empty() && !has_single_bit(VL.size())) {
+  if (!Order.empty() && !hasFullVectorsOnly(*TTI, ScalarTy, Sz)) {
     assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
                                    "supported with VectorizeNonPowerOf2");
     return LoadsState::Gather;
@@ -4761,7 +4794,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
         (IsAnyPointerUsedOutGraph ||
          ((Sz > MinProfitableStridedLoads ||
            (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
-            has_single_bit(AbsoluteDiff))) &&
+            hasFullVectorsOnly(*TTI, ScalarTy, AbsoluteDiff))) &&
           AbsoluteDiff > Sz) ||
          *Diff == -(static_cast<int>(Sz) - 1))) {
       int Stride = *Diff / static_cast<int>(Sz - 1);
@@ -5102,7 +5135,7 @@ static bool areTwoInsertFromSameBuildVector(
 std::optional<BoUpSLP::OrdersType>
 BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
   // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
-  if (TE.isNonPowOf2Vec())
+  if (TE.isNonPowOf2Vec(*TTI))
     return std::nullopt;
 
   // No need to reorder if need to shuffle reuses, still need to shuffle the
@@ -5136,8 +5169,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
       }
     }
     if (Sz == 2 && TE.getVectorFactor() == 4 &&
-        TTI->getNumberOfParts(getWidenedType(TE.Scalars.front()->getType(),
-                                             2 * TE.getVectorFactor())) == 1)
+        TTI->getRegUsageForType(getWidenedType(TE.Scalars.front()->getType(),
+                                               2 * TE.getVectorFactor())) == 1)
       return std::nullopt;
     if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
                                                      Sz)) {
@@ -5491,7 +5524,7 @@ void BoUpSLP::reorderTopToBottom() {
 
   // Reorder the graph nodes according to their vectorization factor.
   for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
-       VF /= 2) {
+       VF -= 2) {
     auto It = VFToOrderedEntries.find(VF);
     if (It == VFToOrderedEntries.end())
       continue;
@@ -5671,7 +5704,7 @@ bool BoUpSLP::canReorderOperands(
     ArrayRef<TreeEntry *> ReorderableGathers,
     SmallVectorImpl<TreeEntry *> &GatherOps) {
   // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
-  if (UserTE->isNonPowOf2Vec())
+  if (UserTE->isNonPowOf2Vec(*TTI))
     return false;
 
   for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
@@ -5846,7 +5879,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
         auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
         const auto AllowsReordering = [&](const TreeEntry *TE) {
           // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
-          if (TE->isNonPowOf2Vec())
+          if (TE->isNonPowOf2Vec(*TTI))
             return false;
           if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
               (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
@@ -6505,7 +6538,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
   case Instruction::ExtractElement: {
     bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
     // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
-    if (!has_single_bit(VL.size()))
+    if (!hasFullVectorsOnly(*TTI, VL0->getType(), VL.size()))
       return TreeEntry::NeedToGather;
     if (Reuse || !CurrentOrder.empty())
       return TreeEntry::Vectorize;
@@ -6915,7 +6948,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       ReuseShuffleIndices.clear();
     } else {
       // FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops.
-      if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
+      if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec(*TTI)) {
         LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
                              "for nodes with padding.\n");
         newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
@@ -6928,7 +6961,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                                                    return isa<UndefValue>(V) ||
                                                           !isConstant(V);
                                                  })) ||
-          !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
+          !hasFullVectorsOnly(*TTI, UniqueValues.front()->getType(),
+                              NumUniqueScalarValues)) {
         if (DoNotFail && UniquePositions.size() > 1 &&
             NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
             all_of(UniqueValues, [=](Value *V) {
@@ -6936,7 +6970,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                      areAllUsersVectorized(cast<Instruction>(V),
                                            UserIgnoreList);
             })) {
-          unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
+          // Find the number of elements, which forms full vectors.
+          unsigned PWSz = getFullVectorNumberOfElements(
+              *TTI, UniqueValues.front()->getType(), UniqueValues.size());
           if (PWSz == VL.size()) {
             ReuseShuffleIndices.clear();
           } else {
@@ -9147,7 +9183,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
     }
     assert(!CommonMask.empty() && "Expected non-empty common mask.");
     auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
-    unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
+    unsigned NumParts = TTI.getRegUsageForType(MaskVecTy);
     if (NumParts == 0 || NumParts >= Mask.size())
       NumParts = 1;
     unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
@@ -9164,7 +9200,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
     }
     assert(!CommonMask.empty() && "Expected non-empty common mask.");
     auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
-    unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
+    unsigned NumParts = TTI.getRegUsageForType(MaskVecTy);
     if (NumParts == 0 || NumParts >= Mask.size())
       NumParts = 1;
     unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
@@ -9682,7 +9718,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     unsigned const NumElts = SrcVecTy->getNumElements();
     unsigned const NumScalars = VL.size();
 
-    unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
+    unsigned NumOfParts = TTI->getRegUsageForType(SrcVecTy);
 
     SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
     unsigned OffsetBeg = *getElementIndex(VL.front());
@@ -10911,7 +10947,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
           // Keep original scalar if number of externally used instructions in
           // the same entry is not power of 2. It may help to do some extra
           // vectorization for now.
-          KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
+          KeepScalar =
+              ScalarUsesCount <= 1 ||
+              !hasFullVectorsOnly(*TTI, EU.Scalar->getType(), ScalarUsesCount);
         }
         if (KeepScalar) {
           ExternalUsesAsOriginalScalar.insert(EU.Scalar);
@@ -11602,13 +11640,14 @@ BoUpSLP::isGatherShuffledEntry(
   if (TE == VectorizableTree.front().get())
     return {};
   // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
-  if (TE->isNonPowOf2Vec())
+  if (TE->isNonPowOf2Vec(*TTI))
     return {};
   Mask.assign(VL.size(), PoisonMaskElem);
   assert(TE->UserTreeIndices.size() == 1 &&
          "Expected only single user of the gather node.");
-  assert(VL.size() % NumParts == 0 &&
-         "Number of scalars must be divisible by NumParts.");
+  // Number of scalars must be divisible by NumParts.
+  if (VL.size() % NumParts != 0)
+    return {};
   unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
   SmallVector<std::optional<TTI::ShuffleKind>> Res;
   for (unsigned Part : seq<unsigned>(NumParts)) {
@@ -12744,7 +12783,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
   SmallVector<SmallVector<const TreeEntry *>> Entries;
   Type *OrigScalarTy = GatheredScalars.front()->getType();
   auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
-  unsigned NumParts = TTI->getNumberOfParts(VecTy);
+  unsigned NumParts = TTI->getRegUsageForType(VecTy);
   if (NumParts == 0 || NumParts >= GatheredScalars.size())
     NumParts = 1;
   if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
@@ -16030,7 +16069,7 @@ void BoUpSLP::computeMinimumValueSizes() {
                [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
       return 0u;
 
-    unsigned NumParts = TTI->getNumberOfParts(
+    unsigned NumParts = TTI->getRegUsageForType(
         getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
 
     // The maximum bit width required to represent all the values that can be
@@ -16087,7 +16126,7 @@ void BoUpSLP::computeMinimumValueSizes() {
     // use - ignore it.
     if (NumParts > 1 &&
         NumParts ==
-            TTI->getNumberOfParts(getWidenedType(
+            TTI->getRegUsageForType(getWidenedType(
                 IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
       return 0u;
 
@@ -16947,7 +16986,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
     for (unsigned I = NextInst; I < MaxInst; ++I) {
       unsigned ActualVF = std::min(MaxInst - I, VF);
 
-      if (!has_single_bit(ActualVF))
+      if (!hasFullVectorsOnly(*TTI, ScalarTy, ActualVF))
         continue;
 
       if (MaxVFOnly && ActualVF < MaxVF)
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reduction-whole-regs-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reduction-whole-regs-loads.ll
index 54dc33dbc0d00b..c9a3158acdda34 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/reduction-whole-regs-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reduction-whole-regs-loads.ll
@@ -4,15 +4,11 @@
 define i64 @test(ptr %p) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 4
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[P]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX_4]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0>
-; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v4i64(<8 x i64> [[TMP2]], <4 x i64> [[TMP0]], i64 0)
-; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v2i64(<8 x i64> [[TMP3]], <2 x i64> [[TMP1]], i64 4)
-; CHECK-NEXT:    [[TMP5:%.*]] = mul <8 x i64> [[TMP4]], <i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42>
-; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]])
-; CHECK-NEXT:    ret i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <6 x i64>, ptr [[P:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <6 x i64> [[TMP0]], <6 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 0, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <8 x i64> [[TMP1]], <i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42>
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP2]])
+; CHECK-NEXT:    ret i64 [[TMP3]]
 ;
 entry:
   %arrayidx.1 = getelementptr inbounds i64, ptr %p, i64 1

@@ -4331,7 +4364,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
   if (!isValidElementType(ScalarTy))
     return std::nullopt;
   auto *VecTy = getWidenedType(ScalarTy, NumScalars);
-  int NumParts = TTI->getNumberOfParts(VecTy);
+  int NumParts = TTI->getRegUsageForType(VecTy);
Collaborator

Will we always need getNumberOfParts and getRegUsageForType do you think?

Member Author

In some places, yes. The reshuffling of buildvector/gather nodes is based on register/parts shuffling. But in other places (where it checks for full register usage, etc.) it will be removed later

Created using spr 1.3.5
@alexey-bataev alexey-bataev merged commit 6ab07d7 into main Aug 30, 2024
4 of 5 checks passed
@alexey-bataev alexey-bataev deleted the users/alexey-bataev/spr/slpinitial-support-for-non-power-of-2-but-still-whole-register-number-of-elements-in-operands branch August 30, 2024 18:50
@mstorsjo
Member

This triggers failed asserts:

int a[2];
int b, c, d, e;
void f() {
  a[b + c * 8] = a[b + c * 8 + 1] = (long long)d + 70912 >> 30;
  a[b + c * 8 + 2] = (long long)e + 70912 >> 30;
  a[b + c * 8 + 3] = (long long)f + 70912 >> 30;
}

Or

int *a;
long long b, d;
long e(long long f) {
  long c = f >> 4;
  return c;
}
void g() {
  long i, j;
  int h;
  {
    long c = d >> 4;
    j = b >> 4;
    i = c;
  }
  h = e(b * 70);
  a[0] = a[1] = i;
  a[2] = j;
  a[3] = h;
}

Both reproduce like this:

$ clang -target i686-linux-gnu -c -O2 repro.c
clang: ../lib/Transforms/Vectorize/SLPVectorizer.cpp:3314: bool llvm::slpvectorizer::BoUpSLP::TreeEntry::isNonPowOf2Vec(const llvm::TargetTransformInfo&) const: Assertion `(!IsNonPowerOf2 || ReuseShuffleIndices.empty()) && "Reshuffling not supported with non-power-of-2 vectors yet."' failed.

@alexey-bataev
Member Author

Thanks for the report, feel free to revert for now, will fix next week

mstorsjo added a commit that referenced this pull request Aug 31, 2024
…ster) number of elements in operands."

This reverts commit 6ab07d7.

This commit caused failed asserts, see
#106449.
@mstorsjo
Member

Thanks for the report, feel free to revert for now, will fix next week

Thanks, I pushed a revert now!

alexey-bataev added a commit that referenced this pull request Aug 31, 2024
…mber of elements in operands.

Patch adds basic support for non-power-of-2 number of elements in
operands. The patch still requires that this number addresses whole
registers.

Reviewers: RKSimon

Reviewed By: RKSimon

Pull Request: #106449
@mikaelholmen
Collaborator

We see crashes also with the updated patch a3ea90f

opt: ../include/llvm/ADT/ArrayRef.h:377: MutableArrayRef<T> llvm::MutableArrayRef<llvm::Value *>::slice(size_t, size_t) const [T = llvm::Value *]: Assertion `N + M <= this->size() && "Invalid specifier"' failed.

 #8 0x0000559277fec73c llvm::slpvectorizer::BoUpSLP::tryToGatherExtractElements(llvm::SmallVectorImpl<llvm::Value*>&, llvm::SmallVectorImpl<int>&, unsigned int) const (opt+0x597673c)
 #9 0x0000559277feb95f llvm::slpvectorizer::BoUpSLP::findReusedOrderedScalars(llvm::slpvectorizer::BoUpSLP::TreeEntry const&) (opt+0x597595f)
#10 0x0000559277ff550e llvm::slpvectorizer::BoUpSLP::getReorderingData(llvm::slpvectorizer::BoUpSLP::TreeEntry const&, bool) (opt+0x597f50e)
#11 0x0000559277ff8d79 llvm::slpvectorizer::BoUpSLP::reorderTopToBottom() (opt+0x5982d79)
#12 0x000055927807557d (anonymous namespace)::HorizontalReduction::tryToReduce(llvm::slpvectorizer::BoUpSLP&, llvm::DataLayout const&, llvm::TargetTransformInfo*, llvm::TargetLibraryInfo const&) SLPVectorizer.cpp:0:0
#13 0x00005592780546de llvm::SLPVectorizerPass::vectorizeHorReduction(llvm::PHINode*, llvm::Instruction*, llvm::BasicBlock*, llvm::slpvectorizer::BoUpSLP&, llvm::TargetTransformInfo*, llvm::SmallVectorImpl<llvm::WeakTrackingVH>&) (opt+0x59de6de)
#14 0x0000559278054a9b llvm::SLPVectorizerPass::vectorizeRootInstruction(llvm::PHINode*, llvm::Instruction*, llvm::BasicBlock*, llvm::slpvectorizer::BoUpSLP&, llvm::TargetTransformInfo*) (opt+0x59dea9b)
#15 0x0000559278049d00 llvm::SLPVectorizerPass::vectorizeChainsInBlock(llvm::BasicBlock*, llvm::slpvectorizer::BoUpSLP&) (opt+0x59d3d00)
#16 0x00005592780469c0 llvm::SLPVectorizerPass::runImpl(llvm::Function&, llvm::ScalarEvolution*, llvm::TargetTransformInfo*, llvm::TargetLibraryInfo*, llvm::AAResults*, llvm::LoopInfo*, llvm::DominatorTree*, llvm::AssumptionCache*, llvm::DemandedBits*, llvm::OptimizationRemarkEmitter*) (opt+0x59d09c0)

Unfortunately I don't have a reproducer to share at the moment. I'll try to extract one but not sure I'll succeed.

@lukel97
Contributor

lukel97 commented Sep 3, 2024

I'm hitting a separate assertion failure when building SPEC CPU 2017 for rva22u64_v at O3:

Assertion failed: (VectorizeNonPowerOf2 && "non-power-of-2 number of loads only " "supported with VectorizeNonPowerOf2"), function canVectorizeLoads, file SLPVectorizer.cpp, line 4771.

The reduced case below crashes with ./build.release/bin/opt --passes=slp-vectorizer -disable-output reduced.ll

target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
target triple = "riscv64-unknown-linux-gnu"

define void @_ZNK6dealii24TensorProductPolynomialsILi3EE17compute_grad_gradEjRKNS_5PointILi3EEE(ptr %agg.result) #0 personality ptr null {
entry:
  %0 = load double, ptr null, align 8
  %mul.1 = fmul double %0, 0.000000e+00
  %arrayidx.i39.1 = getelementptr i8, ptr %agg.result, i64 8
  %add.ptr.i41.1.1 = getelementptr i8, ptr null, i64 8
  %1 = load double, ptr %add.ptr.i41.1.1, align 8
  %mul.1.1 = fmul double %1, 0.000000e+00
  %mul.2.1 = fmul double 0.000000e+00, %mul.1.1
  store double %mul.2.1, ptr %arrayidx.i39.1, align 8
  %arrayidx.i39.2 = getelementptr i8, ptr %agg.result, i64 16
  %mul.1.2 = fmul double %0, 0.000000e+00
  %mul.2.2 = fmul double 0.000000e+00, %mul.1.2
  store double %mul.2.2, ptr %arrayidx.i39.2, align 8
  %arrayidx.i37.1 = getelementptr i8, ptr %agg.result, i64 24
  store double %mul.2.1, ptr %arrayidx.i37.1, align 8
  %arrayidx.i39.1.1 = getelementptr i8, ptr %agg.result, i64 32
  %add.ptr.i41.1.1.1 = getelementptr i8, ptr null, i64 16
  %2 = load double, ptr %add.ptr.i41.1.1.1, align 8
  %mul.1.1.1 = fmul double %2, 1.000000e+00
  %mul.2.1.1 = fmul double 0.000000e+00, %mul.1.1.1
  store double %mul.2.1.1, ptr %arrayidx.i39.1.1, align 8
  %arrayidx.i39.2.1 = getelementptr i8, ptr %agg.result, i64 40
  %mul.1.2.1 = fmul double %1, 0.000000e+00
  %mul.2.2.1 = fmul double 0.000000e+00, %mul.1.2.1
  store double %mul.2.2.1, ptr %arrayidx.i39.2.1, align 8
  %arrayidx.i37.2 = getelementptr i8, ptr %agg.result, i64 48
  store double %mul.2.2, ptr %arrayidx.i37.2, align 8
  %arrayidx.i39.1.2 = getelementptr i8, ptr %agg.result, i64 56
  store double %mul.2.2.1, ptr %arrayidx.i39.1.2, align 8
  %arrayidx.i39.2.2 = getelementptr i8, ptr %agg.result, i64 64
  %mul.1.2.2 = fmul double 1.000000e+00, 0.000000e+00
  %mul.2.2.2 = fmul double 0.000000e+00, %mul.1.2.2
  store double %mul.2.2.2, ptr %arrayidx.i39.2.2, align 8
  ret void
}

; uselistorder directives
uselistorder ptr null, { 1, 2, 3, 4, 5, 0 }

attributes #0 = { "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+v,+za64rs,+zba,+zbb,+zbs,+zfhmin,+zic64b,+zicbom,+zicbop,+zicboz,+ziccamoa,+ziccif,+zicclsm,+ziccrse,+zicntr,+zicsr,+zihintpause,+zihpm,+zkt,+zmmul,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-b,-e,-experimental-smctr,-experimental-smmpm,-experimental-smnpm,-experimental-ssctr,-experimental-ssnpm,-experimental-sspm,-experimental-supm,-experimental-zacas,-experimental-zalasr,-experimental-zicfilp,-experimental-zicfiss,-experimental-zvbc32e,-experimental-zvkgs,-h,-shcounterenw,-shgatpa,-shtvala,-shvsatpa,-shvstvala,-shvstvecd,-smaia,-smcdeleg,-smcsrind,-smepmp,-smstateen,-ssaia,-ssccfg,-ssccptr,-sscofpmf,-sscounterenw,-sscsrind,-ssqosid,-ssstateen,-ssstrict,-sstc,-sstvala,-sstvecd,-ssu64xl,-svade,-svadu,-svbare,-svinval,-svnapot,-svpbmt,-xcvalu,-xcvbi,-xcvbitmanip,-xcvelw,-xcvmac,-xcvmem,-xcvsimd,-xsfcease,-xsfvcp,-xsfvfnrclipxfqf,-xsfvfwmaccqqq,-xsfvqmaccdod,-xsfvqmaccqoq,-xsifivecdiscarddlone,-xsifivecflushdlone,-xtheadba,-xtheadbb,-xtheadbs,-xtheadcmo,-xtheadcondmov,-xtheadfmemidx,-xtheadmac,-xtheadmemidx,-xtheadmempair,-xtheadsync,-xtheadvdot,-xventanacondops,-xwchc,-za128rs,-zaamo,-zabha,-zalrsc,-zama16b,-zawrs,-zbc,-zbkb,-zbkc,-zbkx,-zca,-zcb,-zcd,-zce,-zcf,-zcmop,-zcmp,-zcmt,-zdinx,-zfa,-zfbfmin,-zfh,-zfinx,-zhinx,-zhinxmin,-zicond,-zifencei,-zihintntl,-zimop,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-ztso,-zvbb,-zvbc,-zvfbfmin,-zvfbfwma,-zvfh,-zvfhmin,-zvkb,-zvkg,-zvkn,-zvknc,-zvkned,-zvkng,-zvknha,-zvknhb,-zvks,-zvksc,-zvksed,-zvksg,-zvksh,-zvkt,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" }

@alexey-bataev
Member Author

Fixed in b74e09c

Collaborator

@preames preames left a comment

At a high level, why is this a profitable thing to do? The structure here seems to be assuming (via getRegUsageForType) that all fixed vector types are legalized by splitting to m1. This is not true. The actual lowering for your example here uses an m4 load. Why is it reasonable to cost as if we were splitting during legalization when we're not?

define i64 @test(ptr %p) {
; CHECK-LABEL: test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 6, e64, m4, ta, ma
; CHECK-NEXT:    vle64.v v8, (a0)
; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
; CHECK-NEXT:    addi a0, a0, %lo(.LCPI0_0)
; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
; CHECK-NEXT:    vle16.v v12, (a0)
; CHECK-NEXT:    vrgatherei16.vv v16, v8, v12
; CHECK-NEXT:    li a0, 42
; CHECK-NEXT:    vmul.vx v8, v16, a0
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v12
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %ld = load <6 x i64>, ptr %p, align 4
  %shuffle = shufflevector <6 x i64> %ld, <6 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 0, i32 0>
  %mul = mul <8 x i64> %shuffle, <i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42>
  %sum = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %mul)
  ret i64 %sum
}

Unless you have a clear answer to the above, and I'm just missing something obvious, I think this change should be reverted.

@@ -3276,8 +3308,9 @@ class BoUpSLP {
                           SmallVectorImpl<Value *> *AltScalars = nullptr) const;

     /// Return true if this is a non-power-of-2 node.
-    bool isNonPowOf2Vec() const {
-      bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
+    bool isNonPowOf2Vec(const TargetTransformInfo &TTI) const {
Collaborator

Style-wise, I object to this particular part of the change. Having a vector which can be split into power-of-two pieces (which I think is what you're doing here?) is not the same as that vector not being a power of two. You can adjust the callers to allow this special case if you want, but please revert the API change here. It makes the code markedly less readable.

Member Author

This whole non-power-of-2 handling breaks everything, as I said before. This is a temporary solution, so I don't see an issue here. It will be removed completely once full non-power-of-2 support has landed

Collaborator

The quality of intermediate code states matters. I am specifically providing feedback as a reviewer, and asking you to make a style change. Please do so.

Member Author

I disagree

@alexey-bataev
Member Author

At a high level, why is this a profitable thing to do? […] Unless you have a clear answer to the above, and I'm just missing something obvious, I think this change should be reverted.

Looks like it requires using a new TTI entry instead of this one, since it does not work as expected for RISCV. I will add it.

@preames
Collaborator

preames commented Sep 3, 2024

Looks like it requires using a new TTI entry instead of this one, since it does not work as expected for RISCV. I will add it.

Unless I'm missing something, simply querying the cost of the <6 x i64> type via the existing interfaces should give correct costs on RISCV. I don't know if the work for that has been done for other targets, but I do not see a need for a new TTI call here.

@alexey-bataev
Member Author

Looks like it requires using a new TTI entry instead of this one, since it does not work as expected for RISCV. I will add it.

Unless I'm missing something, simply querying the cost of the <6 x i64> type via the existing interfaces should give correct costs on RISCV. I don't know if the work for that has been done for other targets, but I do not see a need for a new TTI call here.

Not quite. getNumberOfParts() transforms this type to <8 x i64> and e.g. returns 4 registers instead of 3 for x86.
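
A small sketch of the distinction being drawn here, assuming 128-bit vector registers (two i64 elements each); both helpers are simplified stand-ins for illustration, not the actual TTI implementations.

#include <iostream>

// Behaviour described for getNumberOfParts(): the element count is padded up
// to a power of two before counting parts, so <6 x i64> -> <8 x i64> -> 4.
static unsigned numberOfPartsStyle(unsigned NumElts) {
  unsigned Padded = 1;
  while (Padded < NumElts)
    Padded <<= 1;               // 6 -> 8
  return (Padded * 64) / 128;   // 8 x i64 over 128-bit registers = 4
}

// Behaviour described for getRegUsageForType(): whole registers are counted
// for the unpadded type, so <6 x i64> -> 3.
static unsigned regUsageStyle(unsigned NumElts) {
  return (NumElts * 64 + 127) / 128; // ceil(6 * 64 / 128) = 3
}

int main() {
  std::cout << numberOfPartsStyle(6) << '\n'; // 4
  std::cout << regUsageStyle(6) << '\n';      // 3
  return 0;
}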

@alexey-bataev
Member Author

At a high level, why is this a profitable thing to do? […] Unless you have a clear answer to the above, and I'm just missing something obvious, I think this change should be reverted.

After some thought, I decided to revert the patch. I agree that it does not work the way it is intended.
