Skip to content

[SLP] NFC. Replace TreeEntry::setOperandsInOrder with VLOperands. #113880

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Dec 6, 2024

Conversation

HanKuanChen
Copy link
Contributor

To reduce repeated code, TreeEntry::setOperandsInOrder will be replaced by VLOperands.
Arg_size will be provided to make sure other operands will not be reordered when VL[0] is IntrinsicInst (because APO is a boolean value).
In addition, BoUpSLP::reorderInputsAccordingToOpcode will also be removed since it is simple.

@llvmbot
Copy link
Member

llvmbot commented Oct 28, 2024

@llvm/pr-subscribers-llvm-transforms

Author: Han-Kuan Chen (HanKuanChen)

Changes

To reduce repeated code, TreeEntry::setOperandsInOrder will be replaced by VLOperands.
Arg_size will be provided to make sure other operands will not be reordered when VL[0] is IntrinsicInst (because APO is a boolean value).
In addition, BoUpSLP::reorderInputsAccordingToOpcode will also be removed since it is simple.


Full diff: https://github.com/llvm/llvm-project/pull/113880.diff

1 File Affected:

  • (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+51-108)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 2afd02dae3a8b8..14220efb65a195 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1948,6 +1948,9 @@ class BoUpSLP {
 
     /// A vector of operand vectors.
     SmallVector<OperandDataVec, 4> OpsVec;
+    /// When VL[0] is IntrinsicInst, Arg_size is CallBase::arg_size. When VL[0]
+    /// is not IntrinsicInst, Arg_size is User::getNumOperands.
+    unsigned Arg_size;
 
     const TargetLibraryInfo &TLI;
     const DataLayout &DL;
@@ -2337,10 +2340,11 @@ class BoUpSLP {
       assert((empty() || VL.size() == getNumLanes()) &&
              "Expected same number of lanes");
       assert(isa<Instruction>(VL[0]) && "Expected instruction");
+      unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
+      // IntrinsicInst::isCommutative returns true if swapping the first "two"
+      // arguments to the intrinsic produces the same result.
       constexpr unsigned IntrinsicNumOperands = 2;
-      unsigned NumOperands = isa<IntrinsicInst>(VL[0])
-                                 ? IntrinsicNumOperands
-                                 : cast<Instruction>(VL[0])->getNumOperands();
+      Arg_size = isa<IntrinsicInst>(VL[0]) ? IntrinsicNumOperands : NumOperands;
       OpsVec.resize(NumOperands);
       unsigned NumLanes = VL.size();
       for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
@@ -2366,7 +2370,7 @@ class BoUpSLP {
     }
 
     /// \returns the number of operands.
-    unsigned getNumOperands() const { return OpsVec.size(); }
+    unsigned getNumOperands() const { return Arg_size; }
 
     /// \returns the number of lanes.
     unsigned getNumLanes() const { return OpsVec[0].size(); }
@@ -3059,13 +3063,6 @@ class BoUpSLP {
                            SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
                            8> &GatheredLoads);
 
-  /// Reorder commutative or alt operands to get better probability of
-  /// generating vectorized code.
-  static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
-                                             SmallVectorImpl<Value *> &Left,
-                                             SmallVectorImpl<Value *> &Right,
-                                             const BoUpSLP &R);
-
   /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
   /// users of \p TE and collects the stores. It returns the map from the store
   /// pointers to the collected stores.
@@ -3256,24 +3253,6 @@ class BoUpSLP {
       copy(OpVL, Operands[OpIdx].begin());
     }
 
-    /// Set the operands of this bundle in their original order.
-    void setOperandsInOrder() {
-      assert(Operands.empty() && "Already initialized?");
-      auto *I0 = cast<Instruction>(Scalars[0]);
-      Operands.resize(I0->getNumOperands());
-      unsigned NumLanes = Scalars.size();
-      for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
-           OpIdx != NumOperands; ++OpIdx) {
-        Operands[OpIdx].resize(NumLanes);
-        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
-          auto *I = cast<Instruction>(Scalars[Lane]);
-          assert(I->getNumOperands() == NumOperands &&
-                 "Expected same number of operands");
-          Operands[OpIdx][Lane] = I->getOperand(OpIdx);
-        }
-      }
-    }
-
     /// Reorders operands of the node to the given mask \p Mask.
     void reorderOperands(ArrayRef<int> Mask) {
       for (ValueList &Operand : Operands)
@@ -8294,7 +8273,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                                    {}, CurrentOrder);
       LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
 
-      TE->setOperandsInOrder();
+      VLOperands Ops(VL, *this);
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
+        TE->setOperand(I, Ops.getVL(I));
       buildTree_rec(TE->getOperand(1), Depth + 1, {TE, 1});
       return;
     }
@@ -8315,27 +8296,28 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
           LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
         else
           LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
-        TE->setOperandsInOrder();
         break;
       case TreeEntry::StridedVectorize:
         // Vectorizing non-consecutive loads with `llvm.masked.gather`.
         TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                           UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
-        TE->setOperandsInOrder();
         LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n");
         break;
       case TreeEntry::ScatterVectorize:
         // Vectorizing non-consecutive loads with `llvm.masked.gather`.
         TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
                           UserTreeIdx, ReuseShuffleIndices);
-        TE->setOperandsInOrder();
-        buildTree_rec(PointerOps, Depth + 1, {TE, 0});
         LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
         break;
       case TreeEntry::CombinedVectorize:
       case TreeEntry::NeedToGather:
         llvm_unreachable("Unexpected loads state.");
       }
+      VLOperands Ops(VL, *this);
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
+        TE->setOperand(I, Ops.getVL(I));
+      if (State == TreeEntry::ScatterVectorize)
+        buildTree_rec(PointerOps, Depth + 1, {TE, 0});
       return;
     }
     case Instruction::ZExt:
@@ -8373,8 +8355,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                                    ReuseShuffleIndices);
       LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
 
-      TE->setOperandsInOrder();
-      for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
+      VLOperands Ops(VL, *this);
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
+        TE->setOperand(I, Ops.getVL(I));
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
         buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
       if (ShuffleOrOp == Instruction::Trunc) {
         ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
@@ -8401,12 +8385,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
 
       ValueList Left, Right;
+      VLOperands Ops(VL, *this);
       if (cast<CmpInst>(VL0)->isCommutative()) {
         // Commutative predicate - collect + sort operands of the instructions
         // so that each side is more likely to have the same opcode.
         assert(P0 == CmpInst::getSwappedPredicate(P0) &&
                "Commutative Predicate mismatch");
-        reorderInputsAccordingToOpcode(VL, Left, Right, *this);
+        Ops.reorder();
+        Left = Ops.getVL(0);
+        Right = Ops.getVL(1);
       } else {
         // Collect operands - commute if it uses the swapped predicate.
         for (Value *V : VL) {
@@ -8462,20 +8449,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                                    ReuseShuffleIndices);
       LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
 
+      VLOperands Ops(VL, *this);
       // Sort operands of the instructions so that each side is more likely to
       // have the same opcode.
-      if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
-        ValueList Left, Right;
-        reorderInputsAccordingToOpcode(VL, Left, Right, *this);
-        TE->setOperand(0, Left);
-        TE->setOperand(1, Right);
-        buildTree_rec(Left, Depth + 1, {TE, 0});
-        buildTree_rec(Right, Depth + 1, {TE, 1});
-        return;
-      }
-
-      TE->setOperandsInOrder();
-      for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
+      if (isa<BinaryOperator>(VL0) && isCommutative(VL0))
+        Ops.reorder();
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
+        TE->setOperand(I, Ops.getVL(I));
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
         buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
       return;
     }
@@ -8540,7 +8521,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
         fixupOrderingIndices(CurrentOrder);
       TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                    ReuseShuffleIndices, CurrentOrder);
-      TE->setOperandsInOrder();
+      VLOperands Ops(VL, *this);
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
+        TE->setOperand(I, Ops.getVL(I));
       buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0});
       if (Consecutive)
         LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
@@ -8556,46 +8539,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
 
       TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                    ReuseShuffleIndices);
+      VLOperands Ops(VL, *this);
       // Sort operands of the instructions so that each side is more likely to
       // have the same opcode.
-      if (isCommutative(VL0)) {
-        ValueList Left, Right;
-        reorderInputsAccordingToOpcode(VL, Left, Right, *this);
-        TE->setOperand(0, Left);
-        TE->setOperand(1, Right);
-        SmallVector<ValueList> Operands;
-        for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
-          Operands.emplace_back();
-          if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
-            continue;
-          for (Value *V : VL) {
-            auto *CI2 = cast<CallInst>(V);
-            Operands.back().push_back(CI2->getArgOperand(I));
-          }
-          TE->setOperand(I, Operands.back());
-        }
-        buildTree_rec(Left, Depth + 1, {TE, 0});
-        buildTree_rec(Right, Depth + 1, {TE, 1});
-        for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
-          if (Operands[I - 2].empty())
-            continue;
-          buildTree_rec(Operands[I - 2], Depth + 1, {TE, I});
-        }
-        return;
-      }
-      TE->setOperandsInOrder();
-      for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
+      if (isCommutative(VL0))
+        Ops.reorder();
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
+        TE->setOperand(I, Ops.getVL(I));
+      for (unsigned I : seq<unsigned>(CI->arg_size())) {
         // For scalar operands no need to create an entry since no need to
         // vectorize it.
         if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
           continue;
-        ValueList Operands;
-        // Prepare the operand vector.
-        for (Value *V : VL) {
-          auto *CI2 = cast<CallInst>(V);
-          Operands.push_back(CI2->getArgOperand(I));
-        }
-        buildTree_rec(Operands, Depth + 1, {TE, I});
+        buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
       }
       return;
     }
@@ -8604,14 +8560,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                                    ReuseShuffleIndices);
       LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
 
+      VLOperands Ops(VL, *this);
       // Reorder operands if reordering would enable vectorization.
       auto *CI = dyn_cast<CmpInst>(VL0);
       if (isa<BinaryOperator>(VL0) || CI) {
-        ValueList Left, Right;
         if (!CI || all_of(VL, [](Value *V) {
               return cast<CmpInst>(V)->isCommutative();
             })) {
-          reorderInputsAccordingToOpcode(VL, Left, Right, *this);
+          Ops.reorder();
         } else {
           auto *MainCI = cast<CmpInst>(S.MainOp);
           auto *AltCI = cast<CmpInst>(S.AltOp);
@@ -8619,6 +8575,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
           CmpInst::Predicate AltP = AltCI->getPredicate();
           assert(MainP != AltP &&
                  "Expected different main/alternate predicates.");
+          ValueList Left, Right;
           // Collect operands - commute if it uses the swapped predicate or
           // alternate operation.
           for (Value *V : VL) {
@@ -8636,16 +8593,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
             Left.push_back(LHS);
             Right.push_back(RHS);
           }
+          TE->setOperand(0, Left);
+          TE->setOperand(1, Right);
+          buildTree_rec(Left, Depth + 1, {TE, 0});
+          buildTree_rec(Right, Depth + 1, {TE, 1});
+          return;
         }
-        TE->setOperand(0, Left);
-        TE->setOperand(1, Right);
-        buildTree_rec(Left, Depth + 1, {TE, 0});
-        buildTree_rec(Right, Depth + 1, {TE, 1});
-        return;
       }
 
-      TE->setOperandsInOrder();
-      for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
+        TE->setOperand(I, Ops.getVL(I));
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
         buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
       return;
     }
@@ -13024,21 +12982,6 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
   return Cost;
 }
 
-// Perform operand reordering on the instructions in VL and return the reordered
-// operands in Left and Right.
-void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
-                                             SmallVectorImpl<Value *> &Left,
-                                             SmallVectorImpl<Value *> &Right,
-                                             const BoUpSLP &R) {
-  if (VL.empty())
-    return;
-  VLOperands Ops(VL, R);
-  // Reorder the operands in place.
-  Ops.reorder();
-  Left = Ops.getVL(0);
-  Right = Ops.getVL(1);
-}
-
 Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
   auto &Res = EntryToLastInstruction.try_emplace(E).first->second;
   if (Res)

@llvmbot
Copy link
Member

llvmbot commented Oct 28, 2024

@llvm/pr-subscribers-vectorizers

Author: Han-Kuan Chen (HanKuanChen)

Changes

To reduce repeated code, TreeEntry::setOperandsInOrder will be replaced by VLOperands.
Arg_size will be provided to make sure other operands will not be reordered when VL[0] is IntrinsicInst (because APO is a boolean value).
In addition, BoUpSLP::reorderInputsAccordingToOpcode will also be removed since it is simple.


Full diff: https://github.com/llvm/llvm-project/pull/113880.diff

1 File Affected:

  • (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+51-108)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 2afd02dae3a8b8..14220efb65a195 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1948,6 +1948,9 @@ class BoUpSLP {
 
     /// A vector of operand vectors.
     SmallVector<OperandDataVec, 4> OpsVec;
+    /// When VL[0] is IntrinsicInst, Arg_size is CallBase::arg_size. When VL[0]
+    /// is not IntrinsicInst, Arg_size is User::getNumOperands.
+    unsigned Arg_size;
 
     const TargetLibraryInfo &TLI;
     const DataLayout &DL;
@@ -2337,10 +2340,11 @@ class BoUpSLP {
       assert((empty() || VL.size() == getNumLanes()) &&
              "Expected same number of lanes");
       assert(isa<Instruction>(VL[0]) && "Expected instruction");
+      unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
+      // IntrinsicInst::isCommutative returns true if swapping the first "two"
+      // arguments to the intrinsic produces the same result.
       constexpr unsigned IntrinsicNumOperands = 2;
-      unsigned NumOperands = isa<IntrinsicInst>(VL[0])
-                                 ? IntrinsicNumOperands
-                                 : cast<Instruction>(VL[0])->getNumOperands();
+      Arg_size = isa<IntrinsicInst>(VL[0]) ? IntrinsicNumOperands : NumOperands;
       OpsVec.resize(NumOperands);
       unsigned NumLanes = VL.size();
       for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
@@ -2366,7 +2370,7 @@ class BoUpSLP {
     }
 
     /// \returns the number of operands.
-    unsigned getNumOperands() const { return OpsVec.size(); }
+    unsigned getNumOperands() const { return Arg_size; }
 
     /// \returns the number of lanes.
     unsigned getNumLanes() const { return OpsVec[0].size(); }
@@ -3059,13 +3063,6 @@ class BoUpSLP {
                            SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
                            8> &GatheredLoads);
 
-  /// Reorder commutative or alt operands to get better probability of
-  /// generating vectorized code.
-  static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
-                                             SmallVectorImpl<Value *> &Left,
-                                             SmallVectorImpl<Value *> &Right,
-                                             const BoUpSLP &R);
-
   /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
   /// users of \p TE and collects the stores. It returns the map from the store
   /// pointers to the collected stores.
@@ -3256,24 +3253,6 @@ class BoUpSLP {
       copy(OpVL, Operands[OpIdx].begin());
     }
 
-    /// Set the operands of this bundle in their original order.
-    void setOperandsInOrder() {
-      assert(Operands.empty() && "Already initialized?");
-      auto *I0 = cast<Instruction>(Scalars[0]);
-      Operands.resize(I0->getNumOperands());
-      unsigned NumLanes = Scalars.size();
-      for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
-           OpIdx != NumOperands; ++OpIdx) {
-        Operands[OpIdx].resize(NumLanes);
-        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
-          auto *I = cast<Instruction>(Scalars[Lane]);
-          assert(I->getNumOperands() == NumOperands &&
-                 "Expected same number of operands");
-          Operands[OpIdx][Lane] = I->getOperand(OpIdx);
-        }
-      }
-    }
-
     /// Reorders operands of the node to the given mask \p Mask.
     void reorderOperands(ArrayRef<int> Mask) {
       for (ValueList &Operand : Operands)
@@ -8294,7 +8273,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                                    {}, CurrentOrder);
       LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
 
-      TE->setOperandsInOrder();
+      VLOperands Ops(VL, *this);
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
+        TE->setOperand(I, Ops.getVL(I));
       buildTree_rec(TE->getOperand(1), Depth + 1, {TE, 1});
       return;
     }
@@ -8315,27 +8296,28 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
           LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
         else
           LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
-        TE->setOperandsInOrder();
         break;
       case TreeEntry::StridedVectorize:
         // Vectorizing non-consecutive loads with `llvm.masked.gather`.
         TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                           UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
-        TE->setOperandsInOrder();
         LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n");
         break;
       case TreeEntry::ScatterVectorize:
         // Vectorizing non-consecutive loads with `llvm.masked.gather`.
         TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
                           UserTreeIdx, ReuseShuffleIndices);
-        TE->setOperandsInOrder();
-        buildTree_rec(PointerOps, Depth + 1, {TE, 0});
         LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
         break;
       case TreeEntry::CombinedVectorize:
       case TreeEntry::NeedToGather:
         llvm_unreachable("Unexpected loads state.");
       }
+      VLOperands Ops(VL, *this);
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
+        TE->setOperand(I, Ops.getVL(I));
+      if (State == TreeEntry::ScatterVectorize)
+        buildTree_rec(PointerOps, Depth + 1, {TE, 0});
       return;
     }
     case Instruction::ZExt:
@@ -8373,8 +8355,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                                    ReuseShuffleIndices);
       LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
 
-      TE->setOperandsInOrder();
-      for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
+      VLOperands Ops(VL, *this);
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
+        TE->setOperand(I, Ops.getVL(I));
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
         buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
       if (ShuffleOrOp == Instruction::Trunc) {
         ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
@@ -8401,12 +8385,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
 
       ValueList Left, Right;
+      VLOperands Ops(VL, *this);
       if (cast<CmpInst>(VL0)->isCommutative()) {
         // Commutative predicate - collect + sort operands of the instructions
         // so that each side is more likely to have the same opcode.
         assert(P0 == CmpInst::getSwappedPredicate(P0) &&
                "Commutative Predicate mismatch");
-        reorderInputsAccordingToOpcode(VL, Left, Right, *this);
+        Ops.reorder();
+        Left = Ops.getVL(0);
+        Right = Ops.getVL(1);
       } else {
         // Collect operands - commute if it uses the swapped predicate.
         for (Value *V : VL) {
@@ -8462,20 +8449,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                                    ReuseShuffleIndices);
       LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
 
+      VLOperands Ops(VL, *this);
       // Sort operands of the instructions so that each side is more likely to
       // have the same opcode.
-      if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
-        ValueList Left, Right;
-        reorderInputsAccordingToOpcode(VL, Left, Right, *this);
-        TE->setOperand(0, Left);
-        TE->setOperand(1, Right);
-        buildTree_rec(Left, Depth + 1, {TE, 0});
-        buildTree_rec(Right, Depth + 1, {TE, 1});
-        return;
-      }
-
-      TE->setOperandsInOrder();
-      for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
+      if (isa<BinaryOperator>(VL0) && isCommutative(VL0))
+        Ops.reorder();
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
+        TE->setOperand(I, Ops.getVL(I));
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
         buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
       return;
     }
@@ -8540,7 +8521,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
         fixupOrderingIndices(CurrentOrder);
       TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                    ReuseShuffleIndices, CurrentOrder);
-      TE->setOperandsInOrder();
+      VLOperands Ops(VL, *this);
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
+        TE->setOperand(I, Ops.getVL(I));
       buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0});
       if (Consecutive)
         LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
@@ -8556,46 +8539,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
 
       TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                    ReuseShuffleIndices);
+      VLOperands Ops(VL, *this);
       // Sort operands of the instructions so that each side is more likely to
       // have the same opcode.
-      if (isCommutative(VL0)) {
-        ValueList Left, Right;
-        reorderInputsAccordingToOpcode(VL, Left, Right, *this);
-        TE->setOperand(0, Left);
-        TE->setOperand(1, Right);
-        SmallVector<ValueList> Operands;
-        for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
-          Operands.emplace_back();
-          if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
-            continue;
-          for (Value *V : VL) {
-            auto *CI2 = cast<CallInst>(V);
-            Operands.back().push_back(CI2->getArgOperand(I));
-          }
-          TE->setOperand(I, Operands.back());
-        }
-        buildTree_rec(Left, Depth + 1, {TE, 0});
-        buildTree_rec(Right, Depth + 1, {TE, 1});
-        for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
-          if (Operands[I - 2].empty())
-            continue;
-          buildTree_rec(Operands[I - 2], Depth + 1, {TE, I});
-        }
-        return;
-      }
-      TE->setOperandsInOrder();
-      for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
+      if (isCommutative(VL0))
+        Ops.reorder();
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
+        TE->setOperand(I, Ops.getVL(I));
+      for (unsigned I : seq<unsigned>(CI->arg_size())) {
         // For scalar operands no need to create an entry since no need to
         // vectorize it.
         if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
           continue;
-        ValueList Operands;
-        // Prepare the operand vector.
-        for (Value *V : VL) {
-          auto *CI2 = cast<CallInst>(V);
-          Operands.push_back(CI2->getArgOperand(I));
-        }
-        buildTree_rec(Operands, Depth + 1, {TE, I});
+        buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
       }
       return;
     }
@@ -8604,14 +8560,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                                    ReuseShuffleIndices);
       LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
 
+      VLOperands Ops(VL, *this);
       // Reorder operands if reordering would enable vectorization.
       auto *CI = dyn_cast<CmpInst>(VL0);
       if (isa<BinaryOperator>(VL0) || CI) {
-        ValueList Left, Right;
         if (!CI || all_of(VL, [](Value *V) {
               return cast<CmpInst>(V)->isCommutative();
             })) {
-          reorderInputsAccordingToOpcode(VL, Left, Right, *this);
+          Ops.reorder();
         } else {
           auto *MainCI = cast<CmpInst>(S.MainOp);
           auto *AltCI = cast<CmpInst>(S.AltOp);
@@ -8619,6 +8575,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
           CmpInst::Predicate AltP = AltCI->getPredicate();
           assert(MainP != AltP &&
                  "Expected different main/alternate predicates.");
+          ValueList Left, Right;
           // Collect operands - commute if it uses the swapped predicate or
           // alternate operation.
           for (Value *V : VL) {
@@ -8636,16 +8593,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
             Left.push_back(LHS);
             Right.push_back(RHS);
           }
+          TE->setOperand(0, Left);
+          TE->setOperand(1, Right);
+          buildTree_rec(Left, Depth + 1, {TE, 0});
+          buildTree_rec(Right, Depth + 1, {TE, 1});
+          return;
         }
-        TE->setOperand(0, Left);
-        TE->setOperand(1, Right);
-        buildTree_rec(Left, Depth + 1, {TE, 0});
-        buildTree_rec(Right, Depth + 1, {TE, 1});
-        return;
       }
 
-      TE->setOperandsInOrder();
-      for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
+        TE->setOperand(I, Ops.getVL(I));
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
         buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
       return;
     }
@@ -13024,21 +12982,6 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
   return Cost;
 }
 
-// Perform operand reordering on the instructions in VL and return the reordered
-// operands in Left and Right.
-void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
-                                             SmallVectorImpl<Value *> &Left,
-                                             SmallVectorImpl<Value *> &Right,
-                                             const BoUpSLP &R) {
-  if (VL.empty())
-    return;
-  VLOperands Ops(VL, R);
-  // Reorder the operands in place.
-  Ops.reorder();
-  Left = Ops.getVL(0);
-  Right = Ops.getVL(1);
-}
-
 Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
   auto &Res = EntryToLastInstruction.try_emplace(E).first->second;
   if (Res)

@@ -1948,6 +1948,9 @@ class BoUpSLP {

/// A vector of operand vectors.
SmallVector<OperandDataVec, 4> OpsVec;
/// When VL[0] is IntrinsicInst, Arg_size is CallBase::arg_size. When VL[0]
/// is not IntrinsicInst, Arg_size is User::getNumOperands.
unsigned Arg_size;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why the original code cannot be reused, why need an extra data member here? Why just getNumOperands() or 2 does not work?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For IntrinsicInst with more than 2 operands, VLOperands::appendOperandsOfVL stores the first two operands but TreeEntry::setOperandsInOrder stores all of the operands.
Only the first two operands should be reordered. But all of the operands can be passed into buildTree_rec.
If we don't use a new data member (Arg_size), either reorder swaps non-swappable operands or we need to keep TreeEntry::setOperandsInOrder.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just mark such operands as non-reorderable

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are right. We can use 2 here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just mark such operands as non-reorderable

Marking operands as non-reorderable also requires an additional data member to store the information.

Comment on lines 2366 to 2369
/// reorder can only work on instructions with 2 operands. If we use
/// OpsVec.size() here, reorder will swap non-swappable operands (because
/// APO is a boolean value).
unsigned getNumOperands() const { return 2; }
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't like this approach. I plan to extend the reordering to support many operands, better to avoid constant value here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Then we still need arg_size here.
On the other hand, I am curious about how reorder support more operands. Only the first two operands of BinaryOperator, CmpInst and IntrinsicInst are commutative. What kinds of instructions can be supported if we enhance reorder?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if you have add i32 (add p1, p2), (add p3, p4), we can model it as add p1, p2, p3, p4 and reorder operands.

unsigned NumOperands = isa<IntrinsicInst>(VL[0])
? IntrinsicNumOperands
: cast<Instruction>(VL[0])->getNumOperands();
Arg_size = isa<IntrinsicInst>(VL[0]) ? IntrinsicNumOperands : NumOperands;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Still do not understand why we need Arg_size here. BTW, it would be better to use the name NumOperands or ArgSize

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need ArgSize to make sure reorder only swaps the first two operands. After this PR, OpsVec also contains non-swappable operands (because we want appendOperandsOfVL to be reusable even when the operands of VL are not commutative).

Comment on lines 8494 to 8495
if (isa<BinaryOperator>(VL0) && isCommutative(VL0))
Ops.reorder();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Better hide it in setOperand() function

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think it is a good idea. Current reorder usage is

      if (cast<CmpInst>(VL0)->isCommutative()) {
        // Commutative predicate - collect + sort operands of the instructions
        // so that each side is more likely to have the same opcode.
        assert(P0 == CmpInst::getSwappedPredicate(P0) &&
               "Commutative Predicate mismatch");
        Ops.reorder();
      if (isa<BinaryOperator>(VL0) && isCommutative(VL0))
        Ops.reorder();
      if (isCommutative(VL0))
        Ops.reorder();
      auto *CI = dyn_cast<CmpInst>(VL0);
      if (isa<BinaryOperator>(VL0) || CI) {
        if (!CI || all_of(VL, [](Value *V) {
              return cast<CmpInst>(V)->isCommutative();
            })) {
          Ops.reorder();

They use reorder when VL0 is commutative. But the last one determines the condition in another way.
Moving reorder into another place does not make the code neater (the last one would still exist).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pass the flag to do the reordering or not

auto *CI2 = cast<CallInst>(V);
Operands.back().push_back(CI2->getArgOperand(I));
}
TE->setOperand(I, Operands.back());
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about these operands, >= 2?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you mean? They are still in TE. appendOperandsOfVL did not forget them.

Comment on lines -8604 to -8740
if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
continue;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is no such check in the new code in setOperand. What if one of the 2 first operands is isVectorIntrinsicWithScalarOpAtArg()?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
It starts at 2.

The only difference is that the new code will store it.
In the old code, for other arguments which are isVectorIntrinsicWithScalarOpAtArg, setOperand will not store them.
In the new code, for other arguments which are isVectorIntrinsicWithScalarOpAtArg, setOperand will store them.

But both the old code and the new code will not execute buildTree_rec if the argument is isVectorIntrinsicWithScalarOpAtArg.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Better to avoid calling setOperand for isVectorIntrinsicWithScalarOpAtArg args

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But for IntrinsicInst which is NOT commutative, all arguments will be stored into a TreeEntry (and SLP will not call buildTree_rec for those arguments).

If we have to "avoid calling setOperand for isVectorIntrinsicWithScalarOpAtArg args" for commutative IntrinsicInst, we have to do something like the following patch.

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index d8c596eb875d..54210cc0b149 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3267,8 +3267,18 @@ private:
     void setOperand(ArrayRef<Value *> VL, const BoUpSLP &R,
                     bool RequireReorder = false) {
       VLOperands Ops(VL, R);
-      if (RequireReorder)
+      if (RequireReorder) {
         Ops.reorder();
+        if (auto *CI = dyn_cast<CallInst>(VL[0])) {
+          Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, R.TLI);
+          for (unsigned I : seq<unsigned>(CI->arg_size())) {
+            if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
+              continue;
+            setOperand(I, Ops.getVL(I));
+          }
+          return;
+        }
+      }
       for (unsigned I :
            seq<unsigned>(cast<Instruction>(VL[0])->getNumOperands()))
         setOperand(I, Ops.getVL(I));

But it does not look elegant.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But for IntrinsicInst which is NOT commutative, all arguments will be stored into a TreeEntry (and SLP will not call buildTree_rec for those arguments).

If we have to "avoid calling setOperand for isVectorIntrinsicWithScalarOpAtArg args" for commutative IntrinsicInst, we have to do something like the following patch.

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index d8c596eb875d..54210cc0b149 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3267,8 +3267,18 @@ private:
     void setOperand(ArrayRef<Value *> VL, const BoUpSLP &R,
                     bool RequireReorder = false) {
       VLOperands Ops(VL, R);
-      if (RequireReorder)
+      if (RequireReorder) {
         Ops.reorder();
+        if (auto *CI = dyn_cast<CallInst>(VL[0])) {
+          Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, R.TLI);
+          for (unsigned I : seq<unsigned>(CI->arg_size())) {
+            if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
+              continue;
+            setOperand(I, Ops.getVL(I));
+          }
+          return;
+        }
+      }
       for (unsigned I :
            seq<unsigned>(cast<Instruction>(VL[0])->getNumOperands()))
         setOperand(I, Ops.getVL(I));

But it does not look elegant.

We should maintain correctness where possible and should not produce "noise" entries in the graph. They may cause unexpected side effects and some extra compiler crashes

Ops.reorder();
if (auto *CI = dyn_cast<CallInst>(VL[0])) {
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, R.TLI);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Need to add a check that ID is actually an intrinsic and avoid this check for non-intrinsics

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, why this processing is enabled only if RequireReorder is true?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Need to add a check that ID is actually an intrinsic and avoid this check for non-intrinsics

Why do we need to check the ID while the old code does not have to?

Also, why this processing is enabled only if RequireReorder is true?

Because only the commutative intrinsics check it in the old code.

I think we should NOT add isVectorIntrinsicWithScalarOpAtArg check for setOperand (for commutative and non commutative intrinsics). The reason is it will cause some bugs. I will use llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll as an example here.
If we make non commutative intrinsics check isVectorIntrinsicWithScalarOpAtArg, then

  1. %add1 will not be added into TreeEntry (because the operand 1 for powi is scalar operand).
  2. While SLP is trying to do schedule, SLP cannot call DecrUnsched because the operand of llvm.powi is not inside the TreeEntry (it is isVectorIntrinsicWithScalarOpAtArg).
          for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
               OpIdx != NumOperands; ++OpIdx)
            if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
              DecrUnsched(I);
  3. Eventually it triggers the assert
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
    ScheduleData *SD = BS->getScheduleData(I);
    if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
      assert(SD->IsScheduled && "must be scheduled at this point");
  }
#endif

Only smul_fix, smul_fix_sat, umul_fix and umul_fix_sat are non-commutative intrinsics that are also isVectorIntrinsicWithScalarOpAtArg. However, the third operand of these intrinsics is a Constant, which will not trigger the assert.

I think we should revert 664f2d3.

@alexey-bataev
Copy link
Member

Why do we need to check the ID while the old code does not have to?

Because the old code explicitly worked only for CallInst

Because only the commutative intrinsics check it in the old code.

The non-commutative intrinsics also need to have a check for isVectorIntrinsicWithScalarOpAtArg

I think we should NOT add isVectorIntrinsicWithScalarOpAtArg check for setOperand

The check should be there or this whole patch should be dropped. We should not create pseudo tree node for the scalar instructions

@HanKuanChen
Copy link
Contributor Author

HanKuanChen commented Nov 29, 2024

Why do we need to check the ID while the old code does not have to?

Because the load code explicitly worked only for CallInst

But we have a if (auto *CI = dyn_cast<CallInst>(VL[0])) { check here.

Because only the commutative intrinsics check it in the old code.

The non-commutative intrinsics also need to have a check for isVectorIntrinsicWithScalarOpAtArg

If we add isVectorIntrinsicWithScalarOpAtArg check for non-commutative intrinsics, llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll will fail even without this PR. The bug is not caused by this PR.

I think we should NOT add isVectorIntrinsicWithScalarOpAtArg check for setOperand

The check should be there or this whole patch should be dropped. We should not create pseudo tree node for the scalar instructions

If LLVM has an intrinsic in the future and

  • the intrinsic is commutative
  • one of the operand is isVectorIntrinsicWithScalarOpAtArg, and the operand is NOT Constant

SLP will fail because the operand is NOT in the TreeEntry and it will trigger the assert in

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
    ScheduleData *SD = BS->getScheduleData(I);
    if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
      assert(SD->IsScheduled && "must be scheduled at this point");
  }
#endif

Having 664f2d3 will cause a potential bug in the future.

In addition, the current code already has scalar instructions in TreeEntry because non-commutative intrinsics use setOperandsInOrder (and setOperandsInOrder does not have isVectorIntrinsicWithScalarOpAtArg check).

@alexey-bataev
Copy link
Member

Why do we need to check the ID while the old code does not have to?

Because the load code explicitly worked only for CallInst

But we have a if (auto *CI = dyn_cast<CallInst>(VL[0])) { check here.

Because only the commutative intrinsics check it in the old code.

The non-commutative intrinsics also need to have a check for isVectorIntrinsicWithScalarOpAtArg

If we add isVectorIntrinsicWithScalarOpAtArg check for non-commutative intrinsics, llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll will fail even without this PR. The bug is not caused by this PR.

I think we should NOT add isVectorIntrinsicWithScalarOpAtArg check for setOperand

The check should be there or this whole patch should be dropped. We should not create pseudo tree node for the scalar instructions

If LLVM has an intrinsic in the future and

  • the intrinsic is commutative
  • one of the operand is isVectorIntrinsicWithScalarOpAtArg, and the operand is NOT Constant

SLP will get fail because the operand is NOT in TreeEntry and it will trigger the assert in

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
    ScheduleData *SD = BS->getScheduleData(I);
    if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
      assert(SD->IsScheduled && "must be scheduled at this point");
  }
#endif

Have 664f2d3 will cause a potential bug in the future.

In addition, the current code already has scalar instructions in TreeEntry because non-commutative intrinsics use setOperandsInOrder (and setOperandsInOrder does not have isVectorIntrinsicWithScalarOpAtArg check).

It is not in tree, it is just set as the operand of the treeentry, but there is no treeentry for such operands. Such operands will end up as just broadcast gather treeentries and won't be scheduled anyway

@HanKuanChen
Copy link
Contributor Author

Why do we need to check the ID while the old code does not have to?

Because the load code explicitly worked only for CallInst

But we have a if (auto *CI = dyn_cast<CallInst>(VL[0])) { check here.

Because only the commutative intrinsics check it in the old code.

The non-commutative intrinsics also need to have a check for isVectorIntrinsicWithScalarOpAtArg

If we add isVectorIntrinsicWithScalarOpAtArg check for non-commutative intrinsics, llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll will fail even without this PR. The bug is not caused by this PR.

I think we should NOT add isVectorIntrinsicWithScalarOpAtArg check for setOperand

The check should be there or this whole patch should be dropped. We should not create pseudo tree node for the scalar instructions

If LLVM has an intrinsic in the future and

  • the intrinsic is commutative
  • one of the operand is isVectorIntrinsicWithScalarOpAtArg, and the operand is NOT Constant

SLP will get fail because the operand is NOT in TreeEntry and it will trigger the assert in

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
    ScheduleData *SD = BS->getScheduleData(I);
    if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
      assert(SD->IsScheduled && "must be scheduled at this point");
  }
#endif

Have 664f2d3 will cause a potential bug in the future.
In addition, the current code already has scalar instructions in TreeEntry because non-commutative intrinsics use setOperandsInOrder (and setOperandsInOrder does not have isVectorIntrinsicWithScalarOpAtArg check).

It is not in tree, it is just set as the operand of the treeentry, but there is no treeentry for such operands. Such operands will end up as just broadcast gather treeentries and won't be scheduled anyway

SLP will call tryScheduleBundle and calculateDependencies. The latter one will calculate the dependency of ScheduleData.

      // Handle def-use chain dependencies.
      for (User *U : BundleMember->Inst->users()) {
        if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
          BundleMember->Dependencies++;
          ScheduleData *DestBundle = UseSD->FirstInBundle;
          if (!DestBundle->IsScheduled)
            BundleMember->incrementUnscheduledDeps(1);
          if (!DestBundle->hasValidDependencies())
            WorkList.push_back(DestBundle);
        }
      }

After that, it will call scheduleBlock, BS->resetSchedule() (it will update ScheduleData.UnscheduledDeps to ScheduleData.Dependencies) and do

  // Do the "real" scheduling.
  while (!ReadyInsts.empty()) {
    ScheduleData *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    for (ScheduleData *BundleMember = Picked; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      Instruction *PickedInst = BundleMember->Inst;
      if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
        PickedInst->moveAfter(LastScheduledInst->getPrevNode());
      LastScheduledInst = PickedInst;
    }

    BS->schedule(Picked, ReadyInsts);
  }

Then the assert will be triggered because the UnscheduledDeps of %add1 is 5 (because it has 5 uses) but the second operand (@llvm.powi.f32.i32(float %fp1,i32 %add1)) is NOT in the TreeEntry (if we use isVectorIntrinsicWithScalarOpAtArg to filter that). This prevents DecrUnsched from being called, so the bundle of %add1 eventually cannot be scheduled.

          for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
               OpIdx != NumOperands; ++OpIdx)
            if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
              DecrUnsched(I);

Maybe we need to modify BundleMember->Dependencies++ like this?

@@ -16921,9 +16920,14 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
       BundleMember->resetUnscheduledDeps();

       // Handle def-use chain dependencies.
-      for (User *U : BundleMember->Inst->users()) {
-        if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
-          BundleMember->Dependencies++;
+      for (Use &U : BundleMember->Inst->uses()) {
+        User *User = U.getUser();
+        if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(User))) {
+          auto *CI = dyn_cast<CallInst>(User);
+          if (!CI ||
+              !isVectorIntrinsicWithScalarOpAtArg(
+                  getVectorIntrinsicIDForCall(CI, SLP->TLI), U.getOperandNo()))
+            BundleMember->Dependencies++;
           ScheduleData *DestBundle = UseSD->FirstInBundle;
           if (!DestBundle->IsScheduled)
             BundleMember->incrementUnscheduledDeps(1);

To reduce repeated code, TreeEntry::setOperandsInOrder will be replaced
by VLOperands.
ArgSize will be provided to make sure other operands will not be
reordered when VL[0] is IntrinsicInst (because APO is a boolean value).
In addition, BoUpSLP::reorderInputsAccordingToOpcode will also be
removed since it is simple.
Copy link
Member

@alexey-bataev alexey-bataev left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LG, please do not use force push, use stacked PRs

@HanKuanChen HanKuanChen merged commit 94fbe7e into llvm:main Dec 6, 2024
8 checks passed
@HanKuanChen HanKuanChen deleted the slp-refactor-vloperands branch December 6, 2024 04:03
@llvm-ci
Copy link
Collaborator

llvm-ci commented Dec 6, 2024

LLVM Buildbot has detected a new failure on builder openmp-offload-libc-amdgpu-runtime running on omp-vega20-1 while building llvm at step 7 "Add check check-offload".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/73/builds/9720

Here is the relevant piece of the build log for the reference
Step 7 (Add check check-offload) failure: test (failure)
******************** TEST 'libomptarget :: amdgcn-amd-amdhsa :: sanitizer/kernel_crash_async.c' FAILED ********************
Exit Code: 1

Command Output (stdout):
--
# RUN: at line 2
/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/clang -fopenmp    -I /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test -I /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -L /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -L /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./lib -L /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src  -nogpulib -Wl,-rpath,/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -Wl,-rpath,/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -Wl,-rpath,/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./lib  -fopenmp-targets=amdgcn-amd-amdhsa -O3 /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/sanitizer/kernel_crash_async.c -o /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/sanitizer/Output/kernel_crash_async.c.tmp -Xoffload-linker -lc -Xoffload-linker -lm /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./lib/libomptarget.devicertl.a
# executed command: /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/clang -fopenmp -I /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test -I /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -L /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -L /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./lib -L /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -nogpulib -Wl,-rpath,/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -Wl,-rpath,/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -Wl,-rpath,/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./lib -fopenmp-targets=amdgcn-amd-amdhsa -O3 /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/sanitizer/kernel_crash_async.c -o /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/sanitizer/Output/kernel_crash_async.c.tmp -Xoffload-linker -lc -Xoffload-linker -lm /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./lib/libomptarget.devicertl.a
# RUN: at line 3
/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/sanitizer/Output/kernel_crash_async.c.tmp 2>&1 | /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/FileCheck /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/sanitizer/kernel_crash_async.c --check-prefixes=TRACE
# executed command: /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/sanitizer/Output/kernel_crash_async.c.tmp
# executed command: /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/FileCheck /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/sanitizer/kernel_crash_async.c --check-prefixes=TRACE
# RUN: at line 4
/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/not --crash /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/sanitizer/Output/kernel_crash_async.c.tmp 2>&1 | /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/FileCheck /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/sanitizer/kernel_crash_async.c --check-prefixes=CHECK
# executed command: /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/not --crash /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/sanitizer/Output/kernel_crash_async.c.tmp
# executed command: /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/FileCheck /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/sanitizer/kernel_crash_async.c --check-prefixes=CHECK
# .---command stderr------------
# | /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/sanitizer/kernel_crash_async.c:39:11: error: CHECK: expected string not found in input
# | // CHECK: Kernel {{[0-9]}}: {{.*}} (__omp_offloading_{{.*}}_main_l29)
# |           ^
# | <stdin>:1:1: note: scanning from here
# | Display only launched kernel:
# | ^
# | <stdin>:2:16: note: possible intended match here
# | Kernel 'omp target in main @ 29 (__omp_offloading_802_d8283c2_main_l29)'
# |                ^
# | 
# | Input file: <stdin>
# | Check file: /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/sanitizer/kernel_crash_async.c
# | 
# | -dump-input=help explains the following input dump.
# | 
# | Input was:
# | <<<<<<
# |             1: Display only launched kernel: 
# | check:39'0     X~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ error: no match found
# |             2: Kernel 'omp target in main @ 29 (__omp_offloading_802_d8283c2_main_l29)' 
# | check:39'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# | check:39'1                    ?                                                          possible intended match
# |             3: OFFLOAD ERROR: Memory access fault by GPU 1 (agent 0x564b417a8a00) at virtual address (nil). Reasons: Page not present or supervisor privilege, Write access to a read-only page 
# | check:39'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |             4: Use 'OFFLOAD_TRACK_ALLOCATION_TRACES=true' to track device allocations 
# | check:39'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |             5: error: Aborted 
# | check:39'0     ~~~~~~~~~~~~~~~
# | >>>>>>
# `-----------------------------
# error: command failed with exit status: 1

--
...

@lukel97
Copy link
Contributor

lukel97 commented Dec 6, 2024

Hi, I'm getting a crash after this when building 510.parest_r from SPEC CPU 2017:

Assertion failed: (isa<To>(Val) && "cast<Ty>() argument of incompatible type!"), function cast, file Casting.h, line 578.
PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace.
Stack dump:
0.	Program arguments: /Users/luke/Developer/llvm-project/build.release/bin/ld.lld --sysroot=/Users/luke/Developer/riscv64-linux-gnu --hash-style=gnu --eh-frame-hdr -m elf64lriscv -X -pie -dynamic-linker /lib/ld-linux-riscv64-lp64d.so.1 ... -lstdc++ -lm -lgcc_s -lgcc -lc -lgcc_s -lgcc /Users/luke/Developer/riscv64-linux-gnu/lib/crtendS.o /Users/luke/Developer/riscv64-linux-gnu/lib/crtn.o -march=rva23u64 -O3
1.	Running pass "function<eager-inv>(loop-mssa(licm<allowspeculation>),gvn<>,memcpyopt,dse,move-auto-init,mldst-motion<no-split-footer-bb>,loop(indvars,loop-deletion,loop-unroll-full),loop-distribute,loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>,infer-alignment,loop-unroll<O3>,transform-warning,sroa<preserve-cfg>,instcombine<max-iterations=1;no-verify-fixpoint>,simplifycfg<bonus-inst-threshold=1;forward-switch-cond;switch-range-to-icmp;switch-to-lookup;no-keep-loops;hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;sink-common-insts;speculate-blocks;simplify-cond-branch;no-speculate-unpredictables>,sccp,instcombine<max-iterations=1;no-verify-fixpoint>,bdce,slp-vectorizer,vector-combine,infer-alignment,instcombine<max-iterations=1;no-verify-fixpoint>,loop-mssa(licm<allowspeculation>),alignment-from-assumptions,jump-threading)" on module "ld-temp.o"
2.	Running pass "slp-vectorizer" on function "_ZNK6dealii7FE_PolyINS_24TensorProductPolynomialsILi3EEELi3ELi3EE8get_dataENS_11UpdateFlagsERKNS_7MappingILi3ELi3EEERKNS_10QuadratureILi3EEE"
 #0 0x0000000102f62368 llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x102e32368)
 #1 0x0000000102f60330 llvm::sys::RunSignalHandlers() (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x102e30330)
 #2 0x0000000102f62a0c SignalHandler(int) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x102e32a0c)
 #3 0x000000018d545a24 (/usr/lib/system/libsystem_platform.dylib+0x18046da24)
 #4 0x000000018d515cc0 (/usr/lib/system/libsystem_pthread.dylib+0x18043dcc0)
 #5 0x000000018d421a40 (/usr/lib/system/libsystem_c.dylib+0x180349a40)
 #6 0x000000018d420d30 (/usr/lib/system/libsystem_c.dylib+0x180348d30)
 #7 0x0000000101c99414 llvm::slpvectorizer::BoUpSLP::TreeEntry::setOperand(llvm::ArrayRef<llvm::Value*>, llvm::slpvectorizer::BoUpSLP const&, bool) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101b69414)
 #8 0x0000000101c8de6c llvm::slpvectorizer::BoUpSLP::buildTree_rec(llvm::ArrayRef<llvm::Value*>, unsigned int, llvm::slpvectorizer::BoUpSLP::EdgeInfo const&, unsigned int) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101b5de6c)
 #9 0x0000000101c8f4a4 llvm::slpvectorizer::BoUpSLP::buildTree_rec(llvm::ArrayRef<llvm::Value*>, unsigned int, llvm::slpvectorizer::BoUpSLP::EdgeInfo const&, unsigned int) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101b5f4a4)
#10 0x0000000101cd0a68 llvm::SLPVectorizerPass::vectorizeStoreChain(llvm::ArrayRef<llvm::Value*>, llvm::slpvectorizer::BoUpSLP&, unsigned int, unsigned int, unsigned int&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101ba0a68)
#11 0x0000000101cd2ba4 llvm::SLPVectorizerPass::vectorizeStores(llvm::ArrayRef<llvm::StoreInst*>, llvm::slpvectorizer::BoUpSLP&, llvm::DenseSet<std::__1::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, llvm::DenseMapInfo<std::__1::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, void>>&)::$_0::operator()(std::__1::set<std::__1::pair<unsigned int, int>, llvm::SLPVectorizerPass::vectorizeStores(llvm::ArrayRef<llvm::StoreInst*>, llvm::slpvectorizer::BoUpSLP&, llvm::DenseSet<std::__1::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, llvm::DenseMapInfo<std::__1::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, void>>&)::StoreDistCompare, std::__1::allocator<std::__1::pair<unsigned int, int>>> const&) const (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101ba2ba4)
#12 0x0000000101cd1d7c llvm::SLPVectorizerPass::vectorizeStores(llvm::ArrayRef<llvm::StoreInst*>, llvm::slpvectorizer::BoUpSLP&, llvm::DenseSet<std::__1::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, llvm::DenseMapInfo<std::__1::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, void>>&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101ba1d7c)
#13 0x0000000101ccdcb8 llvm::SLPVectorizerPass::vectorizeStoreChains(llvm::slpvectorizer::BoUpSLP&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101b9dcb8)
#14 0x0000000101cccc1c llvm::SLPVectorizerPass::runImpl(llvm::Function&, llvm::ScalarEvolution*, llvm::TargetTransformInfo*, llvm::TargetLibraryInfo*, llvm::AAResults*, llvm::LoopInfo*, llvm::DominatorTree*, llvm::AssumptionCache*, llvm::DemandedBits*, llvm::OptimizationRemarkEmitter*) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101b9cc1c)
#15 0x0000000101ccc298 llvm::SLPVectorizerPass::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101b9c298)
#16 0x0000000102d9fe78 llvm::PassManager<llvm::Function, llvm::AnalysisManager<llvm::Function>>::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x102c6fe78)
#17 0x0000000102da42e8 llvm::ModuleToFunctionPassAdaptor::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x102c742e8)
#18 0x0000000102d9f0a4 llvm::PassManager<llvm::Module, llvm::AnalysisManager<llvm::Module>>::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x102c6f0a4)
#19 0x00000001011bacec llvm::lto::opt(llvm::lto::Config const&, llvm::TargetMachine*, unsigned int, llvm::Module&, bool, llvm::ModuleSummaryIndex*, llvm::ModuleSummaryIndex const*, std::__1::vector<unsigned char, std::__1::allocator<unsigned char>> const&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x10108acec)
#20 0x00000001011bb944 llvm::lto::backend(llvm::lto::Config const&, std::__1::function<llvm::Expected<std::__1::unique_ptr<llvm::CachedFileStream, std::__1::default_delete<llvm::CachedFileStream>>> (unsigned int, llvm::Twine const&)>, unsigned int, llvm::Module&, llvm::ModuleSummaryIndex&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x10108b944)
#21 0x00000001011a4ccc llvm::lto::LTO::runRegularLTO(std::__1::function<llvm::Expected<std::__1::unique_ptr<llvm::CachedFileStream, std::__1::default_delete<llvm::CachedFileStream>>> (unsigned int, llvm::Twine const&)>) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101074ccc)
#22 0x00000001011a4040 llvm::lto::LTO::run(std::__1::function<llvm::Expected<std::__1::unique_ptr<llvm::CachedFileStream, std::__1::default_delete<llvm::CachedFileStream>>> (unsigned int, llvm::Twine const&)>, llvm::FileCache) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101074040)
#23 0x00000001001fba34 lld::elf::BitcodeCompiler::compile() (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x1000cba34)
#24 0x0000000100187cc8 void lld::elf::LinkerDriver::compileBitcodeFiles<llvm::object::ELFType<(llvm::endianness)1, true>>(bool) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x100057cc8)
#25 0x000000010016aac4 void lld::elf::LinkerDriver::link<llvm::object::ELFType<(llvm::endianness)1, true>>(llvm::opt::InputArgList&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x10003aac4)
#26 0x0000000100157784 lld::elf::LinkerDriver::linkerMain(llvm::ArrayRef<char const*>) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x100027784)
#27 0x0000000100154968 lld::elf::link(llvm::ArrayRef<char const*>, llvm::raw_ostream&, llvm::raw_ostream&, bool, bool) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x100024968)
#28 0x00000001005070e0 lld::unsafeLldMain(llvm::ArrayRef<char const*>, llvm::raw_ostream&, llvm::raw_ostream&, llvm::ArrayRef<lld::DriverDef>, bool) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x1003d70e0)
#29 0x0000000100131ee0 lld_main(int, char**, llvm::ToolContext const&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x100001ee0)
#30 0x0000000100132624 main (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x100002624)
#31 0x000000018d1950e0 

This is with an LTO build at -O3 on rva23u64

@HanKuanChen
Copy link
Contributor Author

Hi, I'm getting a crash after this when building 510.parest_r from SPEC CPU 2017:

Assertion failed: (isa<To>(Val) && "cast<Ty>() argument of incompatible type!"), function cast, file Casting.h, line 578.
PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace.
Stack dump:
0.	Program arguments: /Users/luke/Developer/llvm-project/build.release/bin/ld.lld --sysroot=/Users/luke/Developer/riscv64-linux-gnu --hash-style=gnu --eh-frame-hdr -m elf64lriscv -X -pie -dynamic-linker /lib/ld-linux-riscv64-lp64d.so.1 ... -lstdc++ -lm -lgcc_s -lgcc -lc -lgcc_s -lgcc /Users/luke/Developer/riscv64-linux-gnu/lib/crtendS.o /Users/luke/Developer/riscv64-linux-gnu/lib/crtn.o -march=rva23u64 -O3
1.	Running pass "function<eager-inv>(loop-mssa(licm<allowspeculation>),gvn<>,memcpyopt,dse,move-auto-init,mldst-motion<no-split-footer-bb>,loop(indvars,loop-deletion,loop-unroll-full),loop-distribute,loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>,infer-alignment,loop-unroll<O3>,transform-warning,sroa<preserve-cfg>,instcombine<max-iterations=1;no-verify-fixpoint>,simplifycfg<bonus-inst-threshold=1;forward-switch-cond;switch-range-to-icmp;switch-to-lookup;no-keep-loops;hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;sink-common-insts;speculate-blocks;simplify-cond-branch;no-speculate-unpredictables>,sccp,instcombine<max-iterations=1;no-verify-fixpoint>,bdce,slp-vectorizer,vector-combine,infer-alignment,instcombine<max-iterations=1;no-verify-fixpoint>,loop-mssa(licm<allowspeculation>),alignment-from-assumptions,jump-threading)" on module "ld-temp.o"
2.	Running pass "slp-vectorizer" on function "_ZNK6dealii7FE_PolyINS_24TensorProductPolynomialsILi3EEELi3ELi3EE8get_dataENS_11UpdateFlagsERKNS_7MappingILi3ELi3EEERKNS_10QuadratureILi3EEE"
 #0 0x0000000102f62368 llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x102e32368)
 #1 0x0000000102f60330 llvm::sys::RunSignalHandlers() (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x102e30330)
 #2 0x0000000102f62a0c SignalHandler(int) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x102e32a0c)
 #3 0x000000018d545a24 (/usr/lib/system/libsystem_platform.dylib+0x18046da24)
 #4 0x000000018d515cc0 (/usr/lib/system/libsystem_pthread.dylib+0x18043dcc0)
 #5 0x000000018d421a40 (/usr/lib/system/libsystem_c.dylib+0x180349a40)
 #6 0x000000018d420d30 (/usr/lib/system/libsystem_c.dylib+0x180348d30)
 #7 0x0000000101c99414 llvm::slpvectorizer::BoUpSLP::TreeEntry::setOperand(llvm::ArrayRef<llvm::Value*>, llvm::slpvectorizer::BoUpSLP const&, bool) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101b69414)
 #8 0x0000000101c8de6c llvm::slpvectorizer::BoUpSLP::buildTree_rec(llvm::ArrayRef<llvm::Value*>, unsigned int, llvm::slpvectorizer::BoUpSLP::EdgeInfo const&, unsigned int) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101b5de6c)
 #9 0x0000000101c8f4a4 llvm::slpvectorizer::BoUpSLP::buildTree_rec(llvm::ArrayRef<llvm::Value*>, unsigned int, llvm::slpvectorizer::BoUpSLP::EdgeInfo const&, unsigned int) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101b5f4a4)
#10 0x0000000101cd0a68 llvm::SLPVectorizerPass::vectorizeStoreChain(llvm::ArrayRef<llvm::Value*>, llvm::slpvectorizer::BoUpSLP&, unsigned int, unsigned int, unsigned int&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101ba0a68)
#11 0x0000000101cd2ba4 llvm::SLPVectorizerPass::vectorizeStores(llvm::ArrayRef<llvm::StoreInst*>, llvm::slpvectorizer::BoUpSLP&, llvm::DenseSet<std::__1::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, llvm::DenseMapInfo<std::__1::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, void>>&)::$_0::operator()(std::__1::set<std::__1::pair<unsigned int, int>, llvm::SLPVectorizerPass::vectorizeStores(llvm::ArrayRef<llvm::StoreInst*>, llvm::slpvectorizer::BoUpSLP&, llvm::DenseSet<std::__1::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, llvm::DenseMapInfo<std::__1::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, void>>&)::StoreDistCompare, std::__1::allocator<std::__1::pair<unsigned int, int>>> const&) const (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101ba2ba4)
#12 0x0000000101cd1d7c llvm::SLPVectorizerPass::vectorizeStores(llvm::ArrayRef<llvm::StoreInst*>, llvm::slpvectorizer::BoUpSLP&, llvm::DenseSet<std::__1::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, llvm::DenseMapInfo<std::__1::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, void>>&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101ba1d7c)
#13 0x0000000101ccdcb8 llvm::SLPVectorizerPass::vectorizeStoreChains(llvm::slpvectorizer::BoUpSLP&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101b9dcb8)
#14 0x0000000101cccc1c llvm::SLPVectorizerPass::runImpl(llvm::Function&, llvm::ScalarEvolution*, llvm::TargetTransformInfo*, llvm::TargetLibraryInfo*, llvm::AAResults*, llvm::LoopInfo*, llvm::DominatorTree*, llvm::AssumptionCache*, llvm::DemandedBits*, llvm::OptimizationRemarkEmitter*) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101b9cc1c)
#15 0x0000000101ccc298 llvm::SLPVectorizerPass::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101b9c298)
#16 0x0000000102d9fe78 llvm::PassManager<llvm::Function, llvm::AnalysisManager<llvm::Function>>::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x102c6fe78)
#17 0x0000000102da42e8 llvm::ModuleToFunctionPassAdaptor::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x102c742e8)
#18 0x0000000102d9f0a4 llvm::PassManager<llvm::Module, llvm::AnalysisManager<llvm::Module>>::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x102c6f0a4)
#19 0x00000001011bacec llvm::lto::opt(llvm::lto::Config const&, llvm::TargetMachine*, unsigned int, llvm::Module&, bool, llvm::ModuleSummaryIndex*, llvm::ModuleSummaryIndex const*, std::__1::vector<unsigned char, std::__1::allocator<unsigned char>> const&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x10108acec)
#20 0x00000001011bb944 llvm::lto::backend(llvm::lto::Config const&, std::__1::function<llvm::Expected<std::__1::unique_ptr<llvm::CachedFileStream, std::__1::default_delete<llvm::CachedFileStream>>> (unsigned int, llvm::Twine const&)>, unsigned int, llvm::Module&, llvm::ModuleSummaryIndex&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x10108b944)
#21 0x00000001011a4ccc llvm::lto::LTO::runRegularLTO(std::__1::function<llvm::Expected<std::__1::unique_ptr<llvm::CachedFileStream, std::__1::default_delete<llvm::CachedFileStream>>> (unsigned int, llvm::Twine const&)>) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101074ccc)
#22 0x00000001011a4040 llvm::lto::LTO::run(std::__1::function<llvm::Expected<std::__1::unique_ptr<llvm::CachedFileStream, std::__1::default_delete<llvm::CachedFileStream>>> (unsigned int, llvm::Twine const&)>, llvm::FileCache) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101074040)
#23 0x00000001001fba34 lld::elf::BitcodeCompiler::compile() (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x1000cba34)
#24 0x0000000100187cc8 void lld::elf::LinkerDriver::compileBitcodeFiles<llvm::object::ELFType<(llvm::endianness)1, true>>(bool) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x100057cc8)
#25 0x000000010016aac4 void lld::elf::LinkerDriver::link<llvm::object::ELFType<(llvm::endianness)1, true>>(llvm::opt::InputArgList&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x10003aac4)
#26 0x0000000100157784 lld::elf::LinkerDriver::linkerMain(llvm::ArrayRef<char const*>) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x100027784)
#27 0x0000000100154968 lld::elf::link(llvm::ArrayRef<char const*>, llvm::raw_ostream&, llvm::raw_ostream&, bool, bool) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x100024968)
#28 0x00000001005070e0 lld::unsafeLldMain(llvm::ArrayRef<char const*>, llvm::raw_ostream&, llvm::raw_ostream&, llvm::ArrayRef<lld::DriverDef>, bool) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x1003d70e0)
#29 0x0000000100131ee0 lld_main(int, char**, llvm::ToolContext const&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x100001ee0)
#30 0x0000000100132624 main (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x100002624)
#31 0x000000018d1950e0 

This is with an LTO build at -O3 on rva23u64

Do you have a test?

@lukel97
Copy link
Contributor

lukel97 commented Dec 6, 2024

No, it's with an LTO build so there's no single .ll or .c file I can pick out to reduce yet. Do you have access to SPEC CPU 2017?

@HanKuanChen
Copy link
Contributor Author

Could you reproduce it with -Wl,-plugin-opt=-opt-bisect-limit=-1 and -Wl,-plugin-opt=-print-before-pass-number=<N>?
-Wl,-plugin-opt=-print-module-scope is also useful (it dumps the whole module so you can extract a standalone .ll reproducer from the LTO build).

@lukel97
Copy link
Contributor

lukel97 commented Dec 6, 2024

Oh thanks, I was able to get the whole module out. Running llvm-reduce on it now.

@lukel97
Copy link
Contributor

lukel97 commented Dec 6, 2024

target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
target triple = "riscv64-unknown-linux-gnu"

; llvm-reduce'd reproducer for the SLP vectorizer crash (cast<Ty>() assertion in
; TreeEntry::setOperand) — reproduce with: opt -passes=slp-vectorizer.
; NOTE(review): the exact instruction mix (poison store + two identical fmuls,
; with %2 stored twice) and the uselistorder directive below appear to be what
; triggers the bad operand reordering — do not simplify further.
define ptr @_ZNK6dealii7FE_PolyINS_24TensorProductPolynomialsILi3EEELi3ELi3EE8get_dataENS_11UpdateFlagsERKNS_7MappingILi3ELi3EEERKNS_10QuadratureILi3EEE() #0 personality ptr null {
  ; Four consecutive 8-byte stores at offsets 0/8/16/24 form the store chain
  ; that vectorizeStoreChain picks up (see the stack trace above).
  store double poison, ptr null, align 8
  %1 = getelementptr i8, ptr null, i64 8
  %2 = fmul double 0.000000e+00, 0.000000e+00
  store double %2, ptr %1, align 8
  %3 = getelementptr i8, ptr null, i64 16
  %4 = fmul double 0.000000e+00, 0.000000e+00
  store double %4, ptr %3, align 8
  %5 = getelementptr i8, ptr null, i64 24
  ; %2 is reused here (not %4), so one fmul has two uses — presumably relevant
  ; to the operand-reordering bug; verify before altering.
  store double %2, ptr %5, align 8
  ret ptr null
}

; uselistorder directives
uselistorder ptr null, { 1, 2, 3, 4, 5, 6, 7, 0 }

attributes #0 = { "target-features"="+64bit,+a,+b,+c,+d,+experimental,+f,+m,+relax,+supm,+v,+za64rs,+zaamo,+zalrsc,+zawrs,+zba,+zbb,+zbs,+zca,+zcb,+zcmop,+zfa,+zfhmin,+zic64b,+zicbom,+zicbop,+zicboz,+ziccamoa,+ziccif,+zicclsm,+ziccrse,+zicntr,+zicond,+zicsr,+zihintntl,+zihintpause,+zihpm,+zimop,+zkt,+zmmul,+zvbb,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvfhmin,+zvkb,+zvkt,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-smctr,-experimental-ssctr,-experimental-svukte,-experimental-xqcia,-experimental-xqcicsr,-experimental-xqcisls,-experimental-zalasr,-experimental-zicfilp,-experimental-zicfiss,-experimental-zvbc32e,-experimental-zvkgs,-h,-sha,-shcounterenw,-shgatpa,-shtvala,-shvsatpa,-shvstvala,-shvstvecd,-smaia,-smcdeleg,-smcsrind,-smdbltrp,-smepmp,-smmpm,-smnpm,-smrnmi,-smstateen,-ssaia,-ssccfg,-ssccptr,-sscofpmf,-sscounterenw,-sscsrind,-ssdbltrp,-ssnpm,-sspm,-ssqosid,-ssstateen,-ssstrict,-sstc,-sstvala,-sstvecd,-ssu64xl,-svade,-svadu,-svbare,-svinval,-svnapot,-svpbmt,-svvptc,-xcvalu,-xcvbi,-xcvbitmanip,-xcvelw,-xcvmac,-xcvmem,-xcvsimd,-xsfcease,-xsfvcp,-xsfvfnrclipxfqf,-xsfvfwmaccqqq,-xsfvqmaccdod,-xsfvqmaccqoq,-xsifivecdiscarddlone,-xsifivecflushdlone,-xtheadba,-xtheadbb,-xtheadbs,-xtheadcmo,-xtheadcondmov,-xtheadfmemidx,-xtheadmac,-xtheadmemidx,-xtheadmempair,-xtheadsync,-xtheadvdot,-xventanacondops,-xwchc,-za128rs,-zabha,-zacas,-zama16b,-zbc,-zbkb,-zbkc,-zbkx,-zcd,-zce,-zcf,-zcmp,-zcmt,-zdinx,-zfbfmin,-zfh,-zfinx,-zhinx,-zhinxmin,-zifencei,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-ztso,-zvbc,-zvfbfmin,-zvfbfwma,-zvfh,-zvkg,-zvkn,-zvknc,-zvkned,-zvkng,-zvknha,-zvknhb,-zvks,-zvksc,-zvksed,-zvksg,-zvksh,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" }

Can reproduce the crash with opt -passes=slp-vectorizer

nikic added a commit that referenced this pull request Dec 6, 2024
…nds. (#113880)"

This reverts commit 94fbe7e.

Causes a crash when linking mafft in ReleaseLTO-g config.
@nikic
Copy link
Contributor

nikic commented Dec 6, 2024

Reverted this because it breaks the llvm-test-suite build in ReleaseLTO-g configuration.

HanKuanChen added a commit to HanKuanChen/llvm-project that referenced this pull request Dec 10, 2024
…vm#113880)

To reduce repeated code, TreeEntry::setOperandsInOrder will be replaced
by VLOperands.
Arg_size will be provided to make sure other operands will not be
reordered when VL[0] is IntrinsicInst (because APO is a boolean value).
In addition, BoUpSLP::reorderInputsAccordingToOpcode will also be
removed since it is simple.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

6 participants