Skip to content

[SLP] NFC. Replace TreeEntry::setOperandsInOrder with VLOperands. #113880

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Dec 6, 2024

Conversation

HanKuanChen
Copy link
Contributor

To reduce repeated code, TreeEntry::setOperandsInOrder will be replaced by VLOperands.
Arg_size will be provided to make sure other operands will not be reordered when VL[0] is IntrinsicInst (because APO is a boolean value).
In addition, BoUpSLP::reorderInputsAccordingToOpcode will also be removed since it is simple.

@llvmbot
Copy link
Member

llvmbot commented Oct 28, 2024

@llvm/pr-subscribers-llvm-transforms

Author: Han-Kuan Chen (HanKuanChen)

Changes

To reduce repeated code, TreeEntry::setOperandsInOrder will be replaced by VLOperands.
Arg_size will be provided to make sure other operands will not be reordered when VL[0] is IntrinsicInst (because APO is a boolean value).
In addition, BoUpSLP::reorderInputsAccordingToOpcode will also be removed since it is simple.


Full diff: https://github.com/llvm/llvm-project/pull/113880.diff

1 File Affected:

  • (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+51-108)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 2afd02dae3a8b8..14220efb65a195 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1948,6 +1948,9 @@ class BoUpSLP {
 
     /// A vector of operand vectors.
     SmallVector<OperandDataVec, 4> OpsVec;
+    /// When VL[0] is IntrinsicInst, Arg_size is CallBase::arg_size. When VL[0]
+    /// is not IntrinsicInst, Arg_size is User::getNumOperands.
+    unsigned Arg_size;
 
     const TargetLibraryInfo &TLI;
     const DataLayout &DL;
@@ -2337,10 +2340,11 @@ class BoUpSLP {
       assert((empty() || VL.size() == getNumLanes()) &&
              "Expected same number of lanes");
       assert(isa<Instruction>(VL[0]) && "Expected instruction");
+      unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
+      // IntrinsicInst::isCommutative returns true if swapping the first "two"
+      // arguments to the intrinsic produces the same result.
       constexpr unsigned IntrinsicNumOperands = 2;
-      unsigned NumOperands = isa<IntrinsicInst>(VL[0])
-                                 ? IntrinsicNumOperands
-                                 : cast<Instruction>(VL[0])->getNumOperands();
+      Arg_size = isa<IntrinsicInst>(VL[0]) ? IntrinsicNumOperands : NumOperands;
       OpsVec.resize(NumOperands);
       unsigned NumLanes = VL.size();
       for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
@@ -2366,7 +2370,7 @@ class BoUpSLP {
     }
 
     /// \returns the number of operands.
-    unsigned getNumOperands() const { return OpsVec.size(); }
+    unsigned getNumOperands() const { return Arg_size; }
 
     /// \returns the number of lanes.
     unsigned getNumLanes() const { return OpsVec[0].size(); }
@@ -3059,13 +3063,6 @@ class BoUpSLP {
                            SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
                            8> &GatheredLoads);
 
-  /// Reorder commutative or alt operands to get better probability of
-  /// generating vectorized code.
-  static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
-                                             SmallVectorImpl<Value *> &Left,
-                                             SmallVectorImpl<Value *> &Right,
-                                             const BoUpSLP &R);
-
   /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
   /// users of \p TE and collects the stores. It returns the map from the store
   /// pointers to the collected stores.
@@ -3256,24 +3253,6 @@ class BoUpSLP {
       copy(OpVL, Operands[OpIdx].begin());
     }
 
-    /// Set the operands of this bundle in their original order.
-    void setOperandsInOrder() {
-      assert(Operands.empty() && "Already initialized?");
-      auto *I0 = cast<Instruction>(Scalars[0]);
-      Operands.resize(I0->getNumOperands());
-      unsigned NumLanes = Scalars.size();
-      for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
-           OpIdx != NumOperands; ++OpIdx) {
-        Operands[OpIdx].resize(NumLanes);
-        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
-          auto *I = cast<Instruction>(Scalars[Lane]);
-          assert(I->getNumOperands() == NumOperands &&
-                 "Expected same number of operands");
-          Operands[OpIdx][Lane] = I->getOperand(OpIdx);
-        }
-      }
-    }
-
     /// Reorders operands of the node to the given mask \p Mask.
     void reorderOperands(ArrayRef<int> Mask) {
       for (ValueList &Operand : Operands)
@@ -8294,7 +8273,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                                    {}, CurrentOrder);
       LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
 
-      TE->setOperandsInOrder();
+      VLOperands Ops(VL, *this);
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
+        TE->setOperand(I, Ops.getVL(I));
       buildTree_rec(TE->getOperand(1), Depth + 1, {TE, 1});
       return;
     }
@@ -8315,27 +8296,28 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
           LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
         else
           LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
-        TE->setOperandsInOrder();
         break;
       case TreeEntry::StridedVectorize:
         // Vectorizing non-consecutive loads with `llvm.masked.gather`.
         TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                           UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
-        TE->setOperandsInOrder();
         LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n");
         break;
       case TreeEntry::ScatterVectorize:
         // Vectorizing non-consecutive loads with `llvm.masked.gather`.
         TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
                           UserTreeIdx, ReuseShuffleIndices);
-        TE->setOperandsInOrder();
-        buildTree_rec(PointerOps, Depth + 1, {TE, 0});
         LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
         break;
       case TreeEntry::CombinedVectorize:
       case TreeEntry::NeedToGather:
         llvm_unreachable("Unexpected loads state.");
       }
+      VLOperands Ops(VL, *this);
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
+        TE->setOperand(I, Ops.getVL(I));
+      if (State == TreeEntry::ScatterVectorize)
+        buildTree_rec(PointerOps, Depth + 1, {TE, 0});
       return;
     }
     case Instruction::ZExt:
@@ -8373,8 +8355,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                                    ReuseShuffleIndices);
       LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
 
-      TE->setOperandsInOrder();
-      for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
+      VLOperands Ops(VL, *this);
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
+        TE->setOperand(I, Ops.getVL(I));
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
         buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
       if (ShuffleOrOp == Instruction::Trunc) {
         ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
@@ -8401,12 +8385,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
 
       ValueList Left, Right;
+      VLOperands Ops(VL, *this);
       if (cast<CmpInst>(VL0)->isCommutative()) {
         // Commutative predicate - collect + sort operands of the instructions
         // so that each side is more likely to have the same opcode.
         assert(P0 == CmpInst::getSwappedPredicate(P0) &&
                "Commutative Predicate mismatch");
-        reorderInputsAccordingToOpcode(VL, Left, Right, *this);
+        Ops.reorder();
+        Left = Ops.getVL(0);
+        Right = Ops.getVL(1);
       } else {
         // Collect operands - commute if it uses the swapped predicate.
         for (Value *V : VL) {
@@ -8462,20 +8449,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                                    ReuseShuffleIndices);
       LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
 
+      VLOperands Ops(VL, *this);
       // Sort operands of the instructions so that each side is more likely to
       // have the same opcode.
-      if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
-        ValueList Left, Right;
-        reorderInputsAccordingToOpcode(VL, Left, Right, *this);
-        TE->setOperand(0, Left);
-        TE->setOperand(1, Right);
-        buildTree_rec(Left, Depth + 1, {TE, 0});
-        buildTree_rec(Right, Depth + 1, {TE, 1});
-        return;
-      }
-
-      TE->setOperandsInOrder();
-      for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
+      if (isa<BinaryOperator>(VL0) && isCommutative(VL0))
+        Ops.reorder();
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
+        TE->setOperand(I, Ops.getVL(I));
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
         buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
       return;
     }
@@ -8540,7 +8521,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
         fixupOrderingIndices(CurrentOrder);
       TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                    ReuseShuffleIndices, CurrentOrder);
-      TE->setOperandsInOrder();
+      VLOperands Ops(VL, *this);
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
+        TE->setOperand(I, Ops.getVL(I));
       buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0});
       if (Consecutive)
         LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
@@ -8556,46 +8539,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
 
       TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                    ReuseShuffleIndices);
+      VLOperands Ops(VL, *this);
       // Sort operands of the instructions so that each side is more likely to
       // have the same opcode.
-      if (isCommutative(VL0)) {
-        ValueList Left, Right;
-        reorderInputsAccordingToOpcode(VL, Left, Right, *this);
-        TE->setOperand(0, Left);
-        TE->setOperand(1, Right);
-        SmallVector<ValueList> Operands;
-        for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
-          Operands.emplace_back();
-          if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
-            continue;
-          for (Value *V : VL) {
-            auto *CI2 = cast<CallInst>(V);
-            Operands.back().push_back(CI2->getArgOperand(I));
-          }
-          TE->setOperand(I, Operands.back());
-        }
-        buildTree_rec(Left, Depth + 1, {TE, 0});
-        buildTree_rec(Right, Depth + 1, {TE, 1});
-        for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
-          if (Operands[I - 2].empty())
-            continue;
-          buildTree_rec(Operands[I - 2], Depth + 1, {TE, I});
-        }
-        return;
-      }
-      TE->setOperandsInOrder();
-      for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
+      if (isCommutative(VL0))
+        Ops.reorder();
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
+        TE->setOperand(I, Ops.getVL(I));
+      for (unsigned I : seq<unsigned>(CI->arg_size())) {
         // For scalar operands no need to create an entry since no need to
         // vectorize it.
         if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
           continue;
-        ValueList Operands;
-        // Prepare the operand vector.
-        for (Value *V : VL) {
-          auto *CI2 = cast<CallInst>(V);
-          Operands.push_back(CI2->getArgOperand(I));
-        }
-        buildTree_rec(Operands, Depth + 1, {TE, I});
+        buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
       }
       return;
     }
@@ -8604,14 +8560,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                                    ReuseShuffleIndices);
       LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
 
+      VLOperands Ops(VL, *this);
       // Reorder operands if reordering would enable vectorization.
       auto *CI = dyn_cast<CmpInst>(VL0);
       if (isa<BinaryOperator>(VL0) || CI) {
-        ValueList Left, Right;
         if (!CI || all_of(VL, [](Value *V) {
               return cast<CmpInst>(V)->isCommutative();
             })) {
-          reorderInputsAccordingToOpcode(VL, Left, Right, *this);
+          Ops.reorder();
         } else {
           auto *MainCI = cast<CmpInst>(S.MainOp);
           auto *AltCI = cast<CmpInst>(S.AltOp);
@@ -8619,6 +8575,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
           CmpInst::Predicate AltP = AltCI->getPredicate();
           assert(MainP != AltP &&
                  "Expected different main/alternate predicates.");
+          ValueList Left, Right;
           // Collect operands - commute if it uses the swapped predicate or
           // alternate operation.
           for (Value *V : VL) {
@@ -8636,16 +8593,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
             Left.push_back(LHS);
             Right.push_back(RHS);
           }
+          TE->setOperand(0, Left);
+          TE->setOperand(1, Right);
+          buildTree_rec(Left, Depth + 1, {TE, 0});
+          buildTree_rec(Right, Depth + 1, {TE, 1});
+          return;
         }
-        TE->setOperand(0, Left);
-        TE->setOperand(1, Right);
-        buildTree_rec(Left, Depth + 1, {TE, 0});
-        buildTree_rec(Right, Depth + 1, {TE, 1});
-        return;
       }
 
-      TE->setOperandsInOrder();
-      for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
+        TE->setOperand(I, Ops.getVL(I));
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
         buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
       return;
     }
@@ -13024,21 +12982,6 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
   return Cost;
 }
 
-// Perform operand reordering on the instructions in VL and return the reordered
-// operands in Left and Right.
-void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
-                                             SmallVectorImpl<Value *> &Left,
-                                             SmallVectorImpl<Value *> &Right,
-                                             const BoUpSLP &R) {
-  if (VL.empty())
-    return;
-  VLOperands Ops(VL, R);
-  // Reorder the operands in place.
-  Ops.reorder();
-  Left = Ops.getVL(0);
-  Right = Ops.getVL(1);
-}
-
 Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
   auto &Res = EntryToLastInstruction.try_emplace(E).first->second;
   if (Res)

@llvmbot
Copy link
Member

llvmbot commented Oct 28, 2024

@llvm/pr-subscribers-vectorizers

Author: Han-Kuan Chen (HanKuanChen)

Changes

To reduce repeated code, TreeEntry::setOperandsInOrder will be replaced by VLOperands.
Arg_size will be provided to make sure other operands will not be reordered when VL[0] is IntrinsicInst (because APO is a boolean value).
In addition, BoUpSLP::reorderInputsAccordingToOpcode will also be removed since it is simple.


Full diff: https://github.com/llvm/llvm-project/pull/113880.diff

1 File Affected:

  • (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+51-108)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 2afd02dae3a8b8..14220efb65a195 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1948,6 +1948,9 @@ class BoUpSLP {
 
     /// A vector of operand vectors.
     SmallVector<OperandDataVec, 4> OpsVec;
+    /// When VL[0] is IntrinsicInst, Arg_size is CallBase::arg_size. When VL[0]
+    /// is not IntrinsicInst, Arg_size is User::getNumOperands.
+    unsigned Arg_size;
 
     const TargetLibraryInfo &TLI;
     const DataLayout &DL;
@@ -2337,10 +2340,11 @@ class BoUpSLP {
       assert((empty() || VL.size() == getNumLanes()) &&
              "Expected same number of lanes");
       assert(isa<Instruction>(VL[0]) && "Expected instruction");
+      unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
+      // IntrinsicInst::isCommutative returns true if swapping the first "two"
+      // arguments to the intrinsic produces the same result.
       constexpr unsigned IntrinsicNumOperands = 2;
-      unsigned NumOperands = isa<IntrinsicInst>(VL[0])
-                                 ? IntrinsicNumOperands
-                                 : cast<Instruction>(VL[0])->getNumOperands();
+      Arg_size = isa<IntrinsicInst>(VL[0]) ? IntrinsicNumOperands : NumOperands;
       OpsVec.resize(NumOperands);
       unsigned NumLanes = VL.size();
       for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
@@ -2366,7 +2370,7 @@ class BoUpSLP {
     }
 
     /// \returns the number of operands.
-    unsigned getNumOperands() const { return OpsVec.size(); }
+    unsigned getNumOperands() const { return Arg_size; }
 
     /// \returns the number of lanes.
     unsigned getNumLanes() const { return OpsVec[0].size(); }
@@ -3059,13 +3063,6 @@ class BoUpSLP {
                            SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
                            8> &GatheredLoads);
 
-  /// Reorder commutative or alt operands to get better probability of
-  /// generating vectorized code.
-  static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
-                                             SmallVectorImpl<Value *> &Left,
-                                             SmallVectorImpl<Value *> &Right,
-                                             const BoUpSLP &R);
-
   /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
   /// users of \p TE and collects the stores. It returns the map from the store
   /// pointers to the collected stores.
@@ -3256,24 +3253,6 @@ class BoUpSLP {
       copy(OpVL, Operands[OpIdx].begin());
     }
 
-    /// Set the operands of this bundle in their original order.
-    void setOperandsInOrder() {
-      assert(Operands.empty() && "Already initialized?");
-      auto *I0 = cast<Instruction>(Scalars[0]);
-      Operands.resize(I0->getNumOperands());
-      unsigned NumLanes = Scalars.size();
-      for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
-           OpIdx != NumOperands; ++OpIdx) {
-        Operands[OpIdx].resize(NumLanes);
-        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
-          auto *I = cast<Instruction>(Scalars[Lane]);
-          assert(I->getNumOperands() == NumOperands &&
-                 "Expected same number of operands");
-          Operands[OpIdx][Lane] = I->getOperand(OpIdx);
-        }
-      }
-    }
-
     /// Reorders operands of the node to the given mask \p Mask.
     void reorderOperands(ArrayRef<int> Mask) {
       for (ValueList &Operand : Operands)
@@ -8294,7 +8273,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                                    {}, CurrentOrder);
       LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
 
-      TE->setOperandsInOrder();
+      VLOperands Ops(VL, *this);
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
+        TE->setOperand(I, Ops.getVL(I));
       buildTree_rec(TE->getOperand(1), Depth + 1, {TE, 1});
       return;
     }
@@ -8315,27 +8296,28 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
           LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
         else
           LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
-        TE->setOperandsInOrder();
         break;
       case TreeEntry::StridedVectorize:
         // Vectorizing non-consecutive loads with `llvm.masked.gather`.
         TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                           UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
-        TE->setOperandsInOrder();
         LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n");
         break;
       case TreeEntry::ScatterVectorize:
         // Vectorizing non-consecutive loads with `llvm.masked.gather`.
         TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
                           UserTreeIdx, ReuseShuffleIndices);
-        TE->setOperandsInOrder();
-        buildTree_rec(PointerOps, Depth + 1, {TE, 0});
         LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
         break;
       case TreeEntry::CombinedVectorize:
       case TreeEntry::NeedToGather:
         llvm_unreachable("Unexpected loads state.");
       }
+      VLOperands Ops(VL, *this);
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
+        TE->setOperand(I, Ops.getVL(I));
+      if (State == TreeEntry::ScatterVectorize)
+        buildTree_rec(PointerOps, Depth + 1, {TE, 0});
       return;
     }
     case Instruction::ZExt:
@@ -8373,8 +8355,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                                    ReuseShuffleIndices);
       LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
 
-      TE->setOperandsInOrder();
-      for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
+      VLOperands Ops(VL, *this);
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
+        TE->setOperand(I, Ops.getVL(I));
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
         buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
       if (ShuffleOrOp == Instruction::Trunc) {
         ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
@@ -8401,12 +8385,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
 
       ValueList Left, Right;
+      VLOperands Ops(VL, *this);
       if (cast<CmpInst>(VL0)->isCommutative()) {
         // Commutative predicate - collect + sort operands of the instructions
         // so that each side is more likely to have the same opcode.
         assert(P0 == CmpInst::getSwappedPredicate(P0) &&
                "Commutative Predicate mismatch");
-        reorderInputsAccordingToOpcode(VL, Left, Right, *this);
+        Ops.reorder();
+        Left = Ops.getVL(0);
+        Right = Ops.getVL(1);
       } else {
         // Collect operands - commute if it uses the swapped predicate.
         for (Value *V : VL) {
@@ -8462,20 +8449,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                                    ReuseShuffleIndices);
       LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
 
+      VLOperands Ops(VL, *this);
       // Sort operands of the instructions so that each side is more likely to
       // have the same opcode.
-      if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
-        ValueList Left, Right;
-        reorderInputsAccordingToOpcode(VL, Left, Right, *this);
-        TE->setOperand(0, Left);
-        TE->setOperand(1, Right);
-        buildTree_rec(Left, Depth + 1, {TE, 0});
-        buildTree_rec(Right, Depth + 1, {TE, 1});
-        return;
-      }
-
-      TE->setOperandsInOrder();
-      for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
+      if (isa<BinaryOperator>(VL0) && isCommutative(VL0))
+        Ops.reorder();
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
+        TE->setOperand(I, Ops.getVL(I));
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
         buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
       return;
     }
@@ -8540,7 +8521,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
         fixupOrderingIndices(CurrentOrder);
       TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                    ReuseShuffleIndices, CurrentOrder);
-      TE->setOperandsInOrder();
+      VLOperands Ops(VL, *this);
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
+        TE->setOperand(I, Ops.getVL(I));
       buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0});
       if (Consecutive)
         LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
@@ -8556,46 +8539,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
 
       TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                    ReuseShuffleIndices);
+      VLOperands Ops(VL, *this);
       // Sort operands of the instructions so that each side is more likely to
       // have the same opcode.
-      if (isCommutative(VL0)) {
-        ValueList Left, Right;
-        reorderInputsAccordingToOpcode(VL, Left, Right, *this);
-        TE->setOperand(0, Left);
-        TE->setOperand(1, Right);
-        SmallVector<ValueList> Operands;
-        for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
-          Operands.emplace_back();
-          if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
-            continue;
-          for (Value *V : VL) {
-            auto *CI2 = cast<CallInst>(V);
-            Operands.back().push_back(CI2->getArgOperand(I));
-          }
-          TE->setOperand(I, Operands.back());
-        }
-        buildTree_rec(Left, Depth + 1, {TE, 0});
-        buildTree_rec(Right, Depth + 1, {TE, 1});
-        for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
-          if (Operands[I - 2].empty())
-            continue;
-          buildTree_rec(Operands[I - 2], Depth + 1, {TE, I});
-        }
-        return;
-      }
-      TE->setOperandsInOrder();
-      for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
+      if (isCommutative(VL0))
+        Ops.reorder();
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
+        TE->setOperand(I, Ops.getVL(I));
+      for (unsigned I : seq<unsigned>(CI->arg_size())) {
         // For scalar operands no need to create an entry since no need to
         // vectorize it.
         if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
           continue;
-        ValueList Operands;
-        // Prepare the operand vector.
-        for (Value *V : VL) {
-          auto *CI2 = cast<CallInst>(V);
-          Operands.push_back(CI2->getArgOperand(I));
-        }
-        buildTree_rec(Operands, Depth + 1, {TE, I});
+        buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
       }
       return;
     }
@@ -8604,14 +8560,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                                    ReuseShuffleIndices);
       LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
 
+      VLOperands Ops(VL, *this);
       // Reorder operands if reordering would enable vectorization.
       auto *CI = dyn_cast<CmpInst>(VL0);
       if (isa<BinaryOperator>(VL0) || CI) {
-        ValueList Left, Right;
         if (!CI || all_of(VL, [](Value *V) {
               return cast<CmpInst>(V)->isCommutative();
             })) {
-          reorderInputsAccordingToOpcode(VL, Left, Right, *this);
+          Ops.reorder();
         } else {
           auto *MainCI = cast<CmpInst>(S.MainOp);
           auto *AltCI = cast<CmpInst>(S.AltOp);
@@ -8619,6 +8575,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
           CmpInst::Predicate AltP = AltCI->getPredicate();
           assert(MainP != AltP &&
                  "Expected different main/alternate predicates.");
+          ValueList Left, Right;
           // Collect operands - commute if it uses the swapped predicate or
           // alternate operation.
           for (Value *V : VL) {
@@ -8636,16 +8593,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
             Left.push_back(LHS);
             Right.push_back(RHS);
           }
+          TE->setOperand(0, Left);
+          TE->setOperand(1, Right);
+          buildTree_rec(Left, Depth + 1, {TE, 0});
+          buildTree_rec(Right, Depth + 1, {TE, 1});
+          return;
         }
-        TE->setOperand(0, Left);
-        TE->setOperand(1, Right);
-        buildTree_rec(Left, Depth + 1, {TE, 0});
-        buildTree_rec(Right, Depth + 1, {TE, 1});
-        return;
       }
 
-      TE->setOperandsInOrder();
-      for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
+        TE->setOperand(I, Ops.getVL(I));
+      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
         buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
       return;
     }
@@ -13024,21 +12982,6 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
   return Cost;
 }
 
-// Perform operand reordering on the instructions in VL and return the reordered
-// operands in Left and Right.
-void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
-                                             SmallVectorImpl<Value *> &Left,
-                                             SmallVectorImpl<Value *> &Right,
-                                             const BoUpSLP &R) {
-  if (VL.empty())
-    return;
-  VLOperands Ops(VL, R);
-  // Reorder the operands in place.
-  Ops.reorder();
-  Left = Ops.getVL(0);
-  Right = Ops.getVL(1);
-}
-
 Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
   auto &Res = EntryToLastInstruction.try_emplace(E).first->second;
   if (Res)

@@ -1948,6 +1948,9 @@ class BoUpSLP {

/// A vector of operand vectors.
SmallVector<OperandDataVec, 4> OpsVec;
/// When VL[0] is IntrinsicInst, Arg_size is CallBase::arg_size. When VL[0]
/// is not IntrinsicInst, Arg_size is User::getNumOperands.
unsigned Arg_size;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why the original code cannot be reused, why need an extra data member here? Why just getNumOperands() or 2 does not work?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For IntrinsicInst with more than 2 operands, VLOperands::appendOperandsOfVL stores the first two operands but TreeEntry::setOperandsInOrder stores all of the operands.
Only the first two operands should be reordered. But all of the operands can be passed into buildTree_rec.
If we don't use a new data member (Arg_size), either reorder swaps non-swappable operands or we need to keep TreeEntry::setOperandsInOrder.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just mark such operands as non-reorderable

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are right. We can use 2 here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just mark such operands as non-reorderable

Marking operands as non-reorderable also requires an additional data member to store the information.

Comment on lines 2366 to 2369
/// reorder can only work on instructions with 2 operands. If we use
/// OpsVec.size() here, reorder will swap non-swappable operands (because
/// APO is a boolean value).
unsigned getNumOperands() const { return 2; }
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't like this approach. I plan to extend the reordering to support many operands, better to avoid constant value here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Then we still need arg_size here.
On the other hand, I am curious about how reorder support more operands. Only the first two operands of BinaryOperator, CmpInst and IntrinsicInst are commutative. What kinds of instructions can be supported if we enhance reorder?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if you have add i32 (add p1, p2), (add p3, p4), we can model it as add p1, p2, p3, p4 and reorder operands.

unsigned NumOperands = isa<IntrinsicInst>(VL[0])
? IntrinsicNumOperands
: cast<Instruction>(VL[0])->getNumOperands();
Arg_size = isa<IntrinsicInst>(VL[0]) ? IntrinsicNumOperands : NumOperands;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Still do not understand why we need Arg_size here. BTW, it would be better to use the name NumOperands or ArgSize

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need ArgSize to make sure reorder only swaps the first two operands. After this PR, OpsVec also contains non-swappable operands (because we want appendOperandsOfVL to be reusable even when the operands of VL are not commutative).

Comment on lines 8494 to 8495
if (isa<BinaryOperator>(VL0) && isCommutative(VL0))
Ops.reorder();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Better hide it in setOperand() function

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think it is a good idea. Current reorder usage is

      if (cast<CmpInst>(VL0)->isCommutative()) {
        // Commutative predicate - collect + sort operands of the instructions
        // so that each side is more likely to have the same opcode.
        assert(P0 == CmpInst::getSwappedPredicate(P0) &&
               "Commutative Predicate mismatch");
        Ops.reorder();
      if (isa<BinaryOperator>(VL0) && isCommutative(VL0))
        Ops.reorder();
      if (isCommutative(VL0))
        Ops.reorder();
      auto *CI = dyn_cast<CmpInst>(VL0);
      if (isa<BinaryOperator>(VL0) || CI) {
        if (!CI || all_of(VL, [](Value *V) {
              return cast<CmpInst>(V)->isCommutative();
            })) {
          Ops.reorder();

They use reorder when VL0 is commutative. But the last one determines the condition in another way.
Moving reorder into another place does not make the code neater (the last one would still exist).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pass the flag to do the reordering or not

auto *CI2 = cast<CallInst>(V);
Operands.back().push_back(CI2->getArgOperand(I));
}
TE->setOperand(I, Operands.back());
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about these operands, >= 2?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you mean? They are still in TE. appendOperandsOfVL did not forget them.

Comment on lines -8604 to -8740
if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
continue;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is no such check in the new code in setOperand. What if one of the 2 first operands is isVectorIntrinsicWithScalarOpAtArg()?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
It starts at 2.

The only difference is that the new code will store it.
In the old code, for other arguments which are isVectorIntrinsicWithScalarOpAtArg, setOperand will not store them.
In the new code, for other arguments which are isVectorIntrinsicWithScalarOpAtArg, setOperand will store them.

But both the old code and the new code will not execute buildTree_rec if the argument is isVectorIntrinsicWithScalarOpAtArg.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Better to avoid calling setOperand for isVectorIntrinsicWithScalarOpAtArg args

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But for IntrinsicInst which is NOT commutative, all arguments will be stored into a TreeEntry (and SLP will not call buildTree_rec for those arguments).

If we have to "avoid calling setOperand for isVectorIntrinsicWithScalarOpAtArg args" for commutative IntrinsicInst, we have to do something like the following patch.

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index d8c596eb875d..54210cc0b149 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3267,8 +3267,18 @@ private:
     void setOperand(ArrayRef<Value *> VL, const BoUpSLP &R,
                     bool RequireReorder = false) {
       VLOperands Ops(VL, R);
-      if (RequireReorder)
+      if (RequireReorder) {
         Ops.reorder();
+        if (auto *CI = dyn_cast<CallInst>(VL[0])) {
+          Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, R.TLI);
+          for (unsigned I : seq<unsigned>(CI->arg_size())) {
+            if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
+              continue;
+            setOperand(I, Ops.getVL(I));
+          }
+          return;
+        }
+      }
       for (unsigned I :
            seq<unsigned>(cast<Instruction>(VL[0])->getNumOperands()))
         setOperand(I, Ops.getVL(I));

But it does not look elegant.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But for IntrinsicInst which is NOT commutative, all arguments will be stored into a TreeEntry (and SLP will not call buildTree_rec for those arguments).

If we have to "avoid calling setOperand for isVectorIntrinsicWithScalarOpAtArg args" for commutative IntrinsicInst, we have to do something like the following patch.

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index d8c596eb875d..54210cc0b149 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3267,8 +3267,18 @@ private:
     void setOperand(ArrayRef<Value *> VL, const BoUpSLP &R,
                     bool RequireReorder = false) {
       VLOperands Ops(VL, R);
-      if (RequireReorder)
+      if (RequireReorder) {
         Ops.reorder();
+        if (auto *CI = dyn_cast<CallInst>(VL[0])) {
+          Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, R.TLI);
+          for (unsigned I : seq<unsigned>(CI->arg_size())) {
+            if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
+              continue;
+            setOperand(I, Ops.getVL(I));
+          }
+          return;
+        }
+      }
       for (unsigned I :
            seq<unsigned>(cast<Instruction>(VL[0])->getNumOperands()))
         setOperand(I, Ops.getVL(I));

But it does not look elegant.

We should maintain correctness where possible and should not produce "noise" entries in the graph. They may cause unexpected side effects and some extra compiler crashes

Ops.reorder();
if (auto *CI = dyn_cast<CallInst>(VL[0])) {
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, R.TLI);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Need to add a check that ID is actually an intrinsic and avoid this check for non-intrinsics

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, why this processing is enabled only if RequireReorder is true?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Need to add a check that ID is actually an intrinsic and avoid this check for non-intrinsics

Why do we need to check the ID while the old code does not have to?

Also, why this processing is enabled only if RequireReorder is true?

Because only the commutative intrinsics check it in the old code.

I think we should NOT add isVectorIntrinsicWithScalarOpAtArg check for setOperand (for commutative and non commutative intrinsics). The reason is it will cause some bugs. I will use llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll as an example here.
If we make non commutative intrinsics check isVectorIntrinsicWithScalarOpAtArg, then

  1. %add1 will not be added into TreeEntry (because the operand 1 for powi is scalar operand).
  2. While SLP is trying to do schedule, SLP cannot call DecrUnsched because the operand of llvm.powi is not inside the TreeEntry (it is isVectorIntrinsicWithScalarOpAtArg).
          for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
               OpIdx != NumOperands; ++OpIdx)
            if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
              DecrUnsched(I);
  3. Eventually it triggers the assert
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
    ScheduleData *SD = BS->getScheduleData(I);
    if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
      assert(SD->IsScheduled && "must be scheduled at this point");
  }
#endif

Only smul_fix, smul_fix_sat, umul_fix and umul_fix_sat are non-commutative intrinsics that are also isVectorIntrinsicWithScalarOpAtArg. However, the third operand of these intrinsics is a Constant, which will not trigger the assert.

I think we should revert 664f2d3.

@alexey-bataev
Copy link
Member

Why do we need to check the ID while the old code does not have to?

Because the old code explicitly worked only for CallInst

Because only the commutative intrinsics check it in the old code.

The non-commutative intrinsics also need to have a check for isVectorIntrinsicWithScalarOpAtArg

I think we should NOT add isVectorIntrinsicWithScalarOpAtArg check for setOperand

The check should be there or this whole patch should be dropped. We should not create pseudo tree node for the scalar instructions

@HanKuanChen
Copy link
Contributor Author

HanKuanChen commented Nov 29, 2024

Why do we need to check the ID while the old code does not have to?

Because the load code explicitly worked only for CallInst

But we have a if (auto *CI = dyn_cast<CallInst>(VL[0])) { check here.

Because only the commutative intrinsics check it in the old code.

The non-commutative intrinsics also need to have a check for isVectorIntrinsicWithScalarOpAtArg

If we add isVectorIntrinsicWithScalarOpAtArg check for non-commutative intrinsics, llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll will fail even without this PR. The bug is not caused by this PR.

I think we should NOT add isVectorIntrinsicWithScalarOpAtArg check for setOperand

The check should be there or this whole patch should be dropped. We should not create pseudo tree node for the scalar instructions

If LLVM has an intrinsic in the future and

  • the intrinsic is commutative
  • one of the operand is isVectorIntrinsicWithScalarOpAtArg, and the operand is NOT Constant

SLP will fail because the operand is NOT in the TreeEntry and it will trigger the assert in

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
    ScheduleData *SD = BS->getScheduleData(I);
    if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
      assert(SD->IsScheduled && "must be scheduled at this point");
  }
#endif

Having 664f2d3 will cause a potential bug in the future.

In addition, the current code already has scalar instructions in TreeEntry because non-commutative intrinsics use setOperandsInOrder (and setOperandsInOrder does not have isVectorIntrinsicWithScalarOpAtArg check).

@alexey-bataev
Copy link
Member

Why do we need to check the ID while the old code does not have to?

Because the load code explicitly worked only for CallInst

But we have a if (auto *CI = dyn_cast<CallInst>(VL[0])) { check here.

Because only the commutative intrinsics check it in the old code.

The non-commutative intrinsics also need to have a check for isVectorIntrinsicWithScalarOpAtArg

If we add isVectorIntrinsicWithScalarOpAtArg check for non-commutative intrinsics, llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll will fail even without this PR. The bug is not caused by this PR.

I think we should NOT add isVectorIntrinsicWithScalarOpAtArg check for setOperand

The check should be there or this whole patch should be dropped. We should not create pseudo tree node for the scalar instructions

If LLVM has an intrinsic in the future and

  • the intrinsic is commutative
  • one of the operand is isVectorIntrinsicWithScalarOpAtArg, and the operand is NOT Constant

SLP will get fail because the operand is NOT in TreeEntry and it will trigger the assert in

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
    ScheduleData *SD = BS->getScheduleData(I);
    if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
      assert(SD->IsScheduled && "must be scheduled at this point");
  }
#endif

Have 664f2d3 will cause a potential bug in the future.

In addition, the current code already has scalar instructions in TreeEntry because non-commutative intrinsics use setOperandsInOrder (and setOperandsInOrder does not have isVectorIntrinsicWithScalarOpAtArg check).

It is not in tree, it is just set as the operand of the treeentry, but there is no treeentry for such operands. Such operands will end up as just broadcast gather treeentries and won't be scheduled anyway

@HanKuanChen
Copy link
Contributor Author

Why do we need to check the ID while the old code does not have to?

Because the load code explicitly worked only for CallInst

But we have a if (auto *CI = dyn_cast<CallInst>(VL[0])) { check here.

Because only the commutative intrinsics check it in the old code.

The non-commutative intrinsics also need to have a check for isVectorIntrinsicWithScalarOpAtArg

If we add isVectorIntrinsicWithScalarOpAtArg check for non-commutative intrinsics, llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll will fail even without this PR. The bug is not caused by this PR.

I think we should NOT add isVectorIntrinsicWithScalarOpAtArg check for setOperand

The check should be there or this whole patch should be dropped. We should not create pseudo tree node for the scalar instructions

If LLVM has an intrinsic in the future and

  • the intrinsic is commutative
  • one of the operand is isVectorIntrinsicWithScalarOpAtArg, and the operand is NOT Constant

SLP will get fail because the operand is NOT in TreeEntry and it will trigger the assert in

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
    ScheduleData *SD = BS->getScheduleData(I);
    if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
      assert(SD->IsScheduled && "must be scheduled at this point");
  }
#endif

Have 664f2d3 will cause a potential bug in the future.
In addition, the current code already has scalar instructions in TreeEntry because non-commutative intrinsics use setOperandsInOrder (and setOperandsInOrder does not have isVectorIntrinsicWithScalarOpAtArg check).

It is not in tree, it is just set as the operand of the treeentry, but there is no treeentry for such operands. Such operands will end up as just broadcast gather treeentries and won't be scheduled anyway

SLP will call tryScheduleBundle and calculateDependencies. The latter one will calculate the dependency of ScheduleData.

      // Handle def-use chain dependencies.
      for (User *U : BundleMember->Inst->users()) {
        if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
          BundleMember->Dependencies++;
          ScheduleData *DestBundle = UseSD->FirstInBundle;
          if (!DestBundle->IsScheduled)
            BundleMember->incrementUnscheduledDeps(1);
          if (!DestBundle->hasValidDependencies())
            WorkList.push_back(DestBundle);
        }
      }

After that, it will call scheduleBlock, BS->resetSchedule() (it will update ScheduleData.UnscheduledDeps to ScheduleData.Dependencies) and do

  // Do the "real" scheduling.
  while (!ReadyInsts.empty()) {
    ScheduleData *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    for (ScheduleData *BundleMember = Picked; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      Instruction *PickedInst = BundleMember->Inst;
      if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
        PickedInst->moveAfter(LastScheduledInst->getPrevNode());
      LastScheduledInst = PickedInst;
    }

    BS->schedule(Picked, ReadyInsts);
  }

Then the assert will be triggered because the UnscheduledDeps of %add1 is 5 (because it has 5 uses) but the second operand (@llvm.powi.f32.i32(float %fp1,i32 %add1)) is NOT in the TreeEntry (if we use isVectorIntrinsicWithScalarOpAtArg to filter that). This prevents DecrUnsched from being called, so the bundle of %add1 eventually cannot be scheduled.

          for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
               OpIdx != NumOperands; ++OpIdx)
            if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
              DecrUnsched(I);

Maybe we need to modify BundleMember->Dependencies++ like this?

@@ -16921,9 +16920,14 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
       BundleMember->resetUnscheduledDeps();

       // Handle def-use chain dependencies.
-      for (User *U : BundleMember->Inst->users()) {
-        if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
-          BundleMember->Dependencies++;
+      for (Use &U : BundleMember->Inst->uses()) {
+        User *User = U.getUser();
+        if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(User))) {
+          auto *CI = dyn_cast<CallInst>(User);
+          if (!CI ||
+              !isVectorIntrinsicWithScalarOpAtArg(
+                  getVectorIntrinsicIDForCall(CI, SLP->TLI), U.getOperandNo()))
+            BundleMember->Dependencies++;
           ScheduleData *DestBundle = UseSD->FirstInBundle;
           if (!DestBundle->IsScheduled)
             BundleMember->incrementUnscheduledDeps(1);

To reduce repeated code, TreeEntry::setOperandsInOrder will be replaced
by VLOperands.
ArgSize will be provided to make sure other operands will not be
reordered when VL[0] is IntrinsicInst (because APO is a boolean value).
In addition, BoUpSLP::reorderInputsAccordingToOpcode will also be
removed since it is simple.
Copy link
Member

@alexey-bataev alexey-bataev left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LG, please do not use force push, use stacked PRs

@HanKuanChen HanKuanChen merged commit 94fbe7e into llvm:main Dec 6, 2024
8 checks passed
@HanKuanChen HanKuanChen deleted the slp-refactor-vloperands branch December 6, 2024 04:03
@llvm-ci
Copy link
Collaborator

llvm-ci commented Dec 6, 2024

LLVM Buildbot has detected a new failure on builder openmp-offload-libc-amdgpu-runtime running on omp-vega20-1 while building llvm at step 7 "Add check check-offload".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/73/builds/9720

Here is the relevant piece of the build log for the reference
Step 7 (Add check check-offload) failure: test (failure)
******************** TEST 'libomptarget :: amdgcn-amd-amdhsa :: sanitizer/kernel_crash_async.c' FAILED ********************
Exit Code: 1

Command Output (stdout):
--
# RUN: at line 2
/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/clang -fopenmp    -I /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test -I /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -L /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -L /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./lib -L /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src  -nogpulib -Wl,-rpath,/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -Wl,-rpath,/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -Wl,-rpath,/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./lib  -fopenmp-targets=amdgcn-amd-amdhsa -O3 /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/sanitizer/kernel_crash_async.c -o /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/sanitizer/Output/kernel_crash_async.c.tmp -Xoffload-linker -lc -Xoffload-linker -lm /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./lib/libomptarget.devicertl.a
# executed command: /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/clang -fopenmp -I /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test -I /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -L /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -L /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./lib -L /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -nogpulib -Wl,-rpath,/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -Wl,-rpath,/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -Wl,-rpath,/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./lib -fopenmp-targets=amdgcn-amd-amdhsa -O3 /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/sanitizer/kernel_crash_async.c -o /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/sanitizer/Output/kernel_crash_async.c.tmp -Xoffload-linker -lc -Xoffload-linker -lm /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./lib/libomptarget.devicertl.a
# RUN: at line 3
/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/sanitizer/Output/kernel_crash_async.c.tmp 2>&1 | /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/FileCheck /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/sanitizer/kernel_crash_async.c --check-prefixes=TRACE
# executed command: /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/sanitizer/Output/kernel_crash_async.c.tmp
# executed command: /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/FileCheck /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/sanitizer/kernel_crash_async.c --check-prefixes=TRACE
# RUN: at line 4
/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/not --crash /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/sanitizer/Output/kernel_crash_async.c.tmp 2>&1 | /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/FileCheck /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/sanitizer/kernel_crash_async.c --check-prefixes=CHECK
# executed command: /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/not --crash /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/sanitizer/Output/kernel_crash_async.c.tmp
# executed command: /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/FileCheck /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/sanitizer/kernel_crash_async.c --check-prefixes=CHECK
# .---command stderr------------
# | /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/sanitizer/kernel_crash_async.c:39:11: error: CHECK: expected string not found in input
# | // CHECK: Kernel {{[0-9]}}: {{.*}} (__omp_offloading_{{.*}}_main_l29)
# |           ^
# | <stdin>:1:1: note: scanning from here
# | Display only launched kernel:
# | ^
# | <stdin>:2:16: note: possible intended match here
# | Kernel 'omp target in main @ 29 (__omp_offloading_802_d8283c2_main_l29)'
# |                ^
# | 
# | Input file: <stdin>
# | Check file: /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/sanitizer/kernel_crash_async.c
# | 
# | -dump-input=help explains the following input dump.
# | 
# | Input was:
# | <<<<<<
# |             1: Display only launched kernel: 
# | check:39'0     X~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ error: no match found
# |             2: Kernel 'omp target in main @ 29 (__omp_offloading_802_d8283c2_main_l29)' 
# | check:39'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# | check:39'1                    ?                                                          possible intended match
# |             3: OFFLOAD ERROR: Memory access fault by GPU 1 (agent 0x564b417a8a00) at virtual address (nil). Reasons: Page not present or supervisor privilege, Write access to a read-only page 
# | check:39'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |             4: Use 'OFFLOAD_TRACK_ALLOCATION_TRACES=true' to track device allocations 
# | check:39'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |             5: error: Aborted 
# | check:39'0     ~~~~~~~~~~~~~~~
# | >>>>>>
# `-----------------------------
# error: command failed with exit status: 1

--
...

@lukel97
Copy link
Contributor

lukel97 commented Dec 6, 2024

Hi, I'm getting a crash after this when building 510.parest_r from SPEC CPU 2017:

Assertion failed: (isa<To>(Val) && "cast<Ty>() argument of incompatible type!"), function cast, file Casting.h, line 578.
PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace.
Stack dump:
0.	Program arguments: /Users/luke/Developer/llvm-project/build.release/bin/ld.lld --sysroot=/Users/luke/Developer/riscv64-linux-gnu --hash-style=gnu --eh-frame-hdr -m elf64lriscv -X -pie -dynamic-linker /lib/ld-linux-riscv64-lp64d.so.1 ... -lstdc++ -lm -lgcc_s -lgcc -lc -lgcc_s -lgcc /Users/luke/Developer/riscv64-linux-gnu/lib/crtendS.o /Users/luke/Developer/riscv64-linux-gnu/lib/crtn.o -march=rva23u64 -O3
1.	Running pass "function<eager-inv>(loop-mssa(licm<allowspeculation>),gvn<>,memcpyopt,dse,move-auto-init,mldst-motion<no-split-footer-bb>,loop(indvars,loop-deletion,loop-unroll-full),loop-distribute,loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>,infer-alignment,loop-unroll<O3>,transform-warning,sroa<preserve-cfg>,instcombine<max-iterations=1;no-verify-fixpoint>,simplifycfg<bonus-inst-threshold=1;forward-switch-cond;switch-range-to-icmp;switch-to-lookup;no-keep-loops;hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;sink-common-insts;speculate-blocks;simplify-cond-branch;no-speculate-unpredictables>,sccp,instcombine<max-iterations=1;no-verify-fixpoint>,bdce,slp-vectorizer,vector-combine,infer-alignment,instcombine<max-iterations=1;no-verify-fixpoint>,loop-mssa(licm<allowspeculation>),alignment-from-assumptions,jump-threading)" on module "ld-temp.o"
2.	Running pass "slp-vectorizer" on function "_ZNK6dealii7FE_PolyINS_24TensorProductPolynomialsILi3EEELi3ELi3EE8get_dataENS_11UpdateFlagsERKNS_7MappingILi3ELi3EEERKNS_10QuadratureILi3EEE"
 #0 0x0000000102f62368 llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x102e32368)
 #1 0x0000000102f60330 llvm::sys::RunSignalHandlers() (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x102e30330)
 #2 0x0000000102f62a0c SignalHandler(int) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x102e32a0c)
 #3 0x000000018d545a24 (/usr/lib/system/libsystem_platform.dylib+0x18046da24)
 #4 0x000000018d515cc0 (/usr/lib/system/libsystem_pthread.dylib+0x18043dcc0)
 #5 0x000000018d421a40 (/usr/lib/system/libsystem_c.dylib+0x180349a40)
 #6 0x000000018d420d30 (/usr/lib/system/libsystem_c.dylib+0x180348d30)
 #7 0x0000000101c99414 llvm::slpvectorizer::BoUpSLP::TreeEntry::setOperand(llvm::ArrayRef<llvm::Value*>, llvm::slpvectorizer::BoUpSLP const&, bool) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101b69414)
 #8 0x0000000101c8de6c llvm::slpvectorizer::BoUpSLP::buildTree_rec(llvm::ArrayRef<llvm::Value*>, unsigned int, llvm::slpvectorizer::BoUpSLP::EdgeInfo const&, unsigned int) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101b5de6c)
 #9 0x0000000101c8f4a4 llvm::slpvectorizer::BoUpSLP::buildTree_rec(llvm::ArrayRef<llvm::Value*>, unsigned int, llvm::slpvectorizer::BoUpSLP::EdgeInfo const&, unsigned int) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101b5f4a4)
#10 0x0000000101cd0a68 llvm::SLPVectorizerPass::vectorizeStoreChain(llvm::ArrayRef<llvm::Value*>, llvm::slpvectorizer::BoUpSLP&, unsigned int, unsigned int, unsigned int&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101ba0a68)
#11 0x0000000101cd2ba4 llvm::SLPVectorizerPass::vectorizeStores(llvm::ArrayRef<llvm::StoreInst*>, llvm::slpvectorizer::BoUpSLP&, llvm::DenseSet<std::__1::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, llvm::DenseMapInfo<std::__1::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, void>>&)::$_0::operator()(std::__1::set<std::__1::pair<unsigned int, int>, llvm::SLPVectorizerPass::vectorizeStores(llvm::ArrayRef<llvm::StoreInst*>, llvm::slpvectorizer::BoUpSLP&, llvm::DenseSet<std::__1::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, llvm::DenseMapInfo<std::__1::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, void>>&)::StoreDistCompare, std::__1::allocator<std::__1::pair<unsigned int, int>>> const&) const (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101ba2ba4)
#12 0x0000000101cd1d7c llvm::SLPVectorizerPass::vectorizeStores(llvm::ArrayRef<llvm::StoreInst*>, llvm::slpvectorizer::BoUpSLP&, llvm::DenseSet<std::__1::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, llvm::DenseMapInfo<std::__1::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, void>>&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101ba1d7c)
#13 0x0000000101ccdcb8 llvm::SLPVectorizerPass::vectorizeStoreChains(llvm::slpvectorizer::BoUpSLP&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101b9dcb8)
#14 0x0000000101cccc1c llvm::SLPVectorizerPass::runImpl(llvm::Function&, llvm::ScalarEvolution*, llvm::TargetTransformInfo*, llvm::TargetLibraryInfo*, llvm::AAResults*, llvm::LoopInfo*, llvm::DominatorTree*, llvm::AssumptionCache*, llvm::DemandedBits*, llvm::OptimizationRemarkEmitter*) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101b9cc1c)
#15 0x0000000101ccc298 llvm::SLPVectorizerPass::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101b9c298)
#16 0x0000000102d9fe78 llvm::PassManager<llvm::Function, llvm::AnalysisManager<llvm::Function>>::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x102c6fe78)
#17 0x0000000102da42e8 llvm::ModuleToFunctionPassAdaptor::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x102c742e8)
#18 0x0000000102d9f0a4 llvm::PassManager<llvm::Module, llvm::AnalysisManager<llvm::Module>>::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x102c6f0a4)
#19 0x00000001011bacec llvm::lto::opt(llvm::lto::Config const&, llvm::TargetMachine*, unsigned int, llvm::Module&, bool, llvm::ModuleSummaryIndex*, llvm::ModuleSummaryIndex const*, std::__1::vector<unsigned char, std::__1::allocator<unsigned char>> const&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x10108acec)
#20 0x00000001011bb944 llvm::lto::backend(llvm::lto::Config const&, std::__1::function<llvm::Expected<std::__1::unique_ptr<llvm::CachedFileStream, std::__1::default_delete<llvm::CachedFileStream>>> (unsigned int, llvm::Twine const&)>, unsigned int, llvm::Module&, llvm::ModuleSummaryIndex&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x10108b944)
#21 0x00000001011a4ccc llvm::lto::LTO::runRegularLTO(std::__1::function<llvm::Expected<std::__1::unique_ptr<llvm::CachedFileStream, std::__1::default_delete<llvm::CachedFileStream>>> (unsigned int, llvm::Twine const&)>) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101074ccc)
#22 0x00000001011a4040 llvm::lto::LTO::run(std::__1::function<llvm::Expected<std::__1::unique_ptr<llvm::CachedFileStream, std::__1::default_delete<llvm::CachedFileStream>>> (unsigned int, llvm::Twine const&)>, llvm::FileCache) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101074040)
#23 0x00000001001fba34 lld::elf::BitcodeCompiler::compile() (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x1000cba34)
#24 0x0000000100187cc8 void lld::elf::LinkerDriver::compileBitcodeFiles<llvm::object::ELFType<(llvm::endianness)1, true>>(bool) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x100057cc8)
#25 0x000000010016aac4 void lld::elf::LinkerDriver::link<llvm::object::ELFType<(llvm::endianness)1, true>>(llvm::opt::InputArgList&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x10003aac4)
#26 0x0000000100157784 lld::elf::LinkerDriver::linkerMain(llvm::ArrayRef<char const*>) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x100027784)
#27 0x0000000100154968 lld::elf::link(llvm::ArrayRef<char const*>, llvm::raw_ostream&, llvm::raw_ostream&, bool, bool) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x100024968)
#28 0x00000001005070e0 lld::unsafeLldMain(llvm::ArrayRef<char const*>, llvm::raw_ostream&, llvm::raw_ostream&, llvm::ArrayRef<lld::DriverDef>, bool) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x1003d70e0)
#29 0x0000000100131ee0 lld_main(int, char**, llvm::ToolContext const&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x100001ee0)
#30 0x0000000100132624 main (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x100002624)
#31 0x000000018d1950e0 

This is with an LTO build at -O3 on rva23u64

@HanKuanChen
Copy link
Contributor Author

Hi, I'm getting a crash after this when building 510.parest_r from SPEC CPU 2017:

Assertion failed: (isa<To>(Val) && "cast<Ty>() argument of incompatible type!"), function cast, file Casting.h, line 578.
PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace.
Stack dump:
0.	Program arguments: /Users/luke/Developer/llvm-project/build.release/bin/ld.lld --sysroot=/Users/luke/Developer/riscv64-linux-gnu --hash-style=gnu --eh-frame-hdr -m elf64lriscv -X -pie -dynamic-linker /lib/ld-linux-riscv64-lp64d.so.1 ... -lstdc++ -lm -lgcc_s -lgcc -lc -lgcc_s -lgcc /Users/luke/Developer/riscv64-linux-gnu/lib/crtendS.o /Users/luke/Developer/riscv64-linux-gnu/lib/crtn.o -march=rva23u64 -O3
1.	Running pass "function<eager-inv>(loop-mssa(licm<allowspeculation>),gvn<>,memcpyopt,dse,move-auto-init,mldst-motion<no-split-footer-bb>,loop(indvars,loop-deletion,loop-unroll-full),loop-distribute,loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>,infer-alignment,loop-unroll<O3>,transform-warning,sroa<preserve-cfg>,instcombine<max-iterations=1;no-verify-fixpoint>,simplifycfg<bonus-inst-threshold=1;forward-switch-cond;switch-range-to-icmp;switch-to-lookup;no-keep-loops;hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;sink-common-insts;speculate-blocks;simplify-cond-branch;no-speculate-unpredictables>,sccp,instcombine<max-iterations=1;no-verify-fixpoint>,bdce,slp-vectorizer,vector-combine,infer-alignment,instcombine<max-iterations=1;no-verify-fixpoint>,loop-mssa(licm<allowspeculation>),alignment-from-assumptions,jump-threading)" on module "ld-temp.o"
2.	Running pass "slp-vectorizer" on function "_ZNK6dealii7FE_PolyINS_24TensorProductPolynomialsILi3EEELi3ELi3EE8get_dataENS_11UpdateFlagsERKNS_7MappingILi3ELi3EEERKNS_10QuadratureILi3EEE"
 #0 0x0000000102f62368 llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x102e32368)
 #1 0x0000000102f60330 llvm::sys::RunSignalHandlers() (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x102e30330)
 #2 0x0000000102f62a0c SignalHandler(int) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x102e32a0c)
 #3 0x000000018d545a24 (/usr/lib/system/libsystem_platform.dylib+0x18046da24)
 #4 0x000000018d515cc0 (/usr/lib/system/libsystem_pthread.dylib+0x18043dcc0)
 #5 0x000000018d421a40 (/usr/lib/system/libsystem_c.dylib+0x180349a40)
 #6 0x000000018d420d30 (/usr/lib/system/libsystem_c.dylib+0x180348d30)
 #7 0x0000000101c99414 llvm::slpvectorizer::BoUpSLP::TreeEntry::setOperand(llvm::ArrayRef<llvm::Value*>, llvm::slpvectorizer::BoUpSLP const&, bool) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101b69414)
 #8 0x0000000101c8de6c llvm::slpvectorizer::BoUpSLP::buildTree_rec(llvm::ArrayRef<llvm::Value*>, unsigned int, llvm::slpvectorizer::BoUpSLP::EdgeInfo const&, unsigned int) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101b5de6c)
 #9 0x0000000101c8f4a4 llvm::slpvectorizer::BoUpSLP::buildTree_rec(llvm::ArrayRef<llvm::Value*>, unsigned int, llvm::slpvectorizer::BoUpSLP::EdgeInfo const&, unsigned int) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101b5f4a4)
#10 0x0000000101cd0a68 llvm::SLPVectorizerPass::vectorizeStoreChain(llvm::ArrayRef<llvm::Value*>, llvm::slpvectorizer::BoUpSLP&, unsigned int, unsigned int, unsigned int&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101ba0a68)
#11 0x0000000101cd2ba4 llvm::SLPVectorizerPass::vectorizeStores(llvm::ArrayRef<llvm::StoreInst*>, llvm::slpvectorizer::BoUpSLP&, llvm::DenseSet<std::__1::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, llvm::DenseMapInfo<std::__1::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, void>>&)::$_0::operator()(std::__1::set<std::__1::pair<unsigned int, int>, llvm::SLPVectorizerPass::vectorizeStores(llvm::ArrayRef<llvm::StoreInst*>, llvm::slpvectorizer::BoUpSLP&, llvm::DenseSet<std::__1::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, llvm::DenseMapInfo<std::__1::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, void>>&)::StoreDistCompare, std::__1::allocator<std::__1::pair<unsigned int, int>>> const&) const (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101ba2ba4)
#12 0x0000000101cd1d7c llvm::SLPVectorizerPass::vectorizeStores(llvm::ArrayRef<llvm::StoreInst*>, llvm::slpvectorizer::BoUpSLP&, llvm::DenseSet<std::__1::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, llvm::DenseMapInfo<std::__1::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, void>>&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101ba1d7c)
#13 0x0000000101ccdcb8 llvm::SLPVectorizerPass::vectorizeStoreChains(llvm::slpvectorizer::BoUpSLP&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101b9dcb8)
#14 0x0000000101cccc1c llvm::SLPVectorizerPass::runImpl(llvm::Function&, llvm::ScalarEvolution*, llvm::TargetTransformInfo*, llvm::TargetLibraryInfo*, llvm::AAResults*, llvm::LoopInfo*, llvm::DominatorTree*, llvm::AssumptionCache*, llvm::DemandedBits*, llvm::OptimizationRemarkEmitter*) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101b9cc1c)
#15 0x0000000101ccc298 llvm::SLPVectorizerPass::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101b9c298)
#16 0x0000000102d9fe78 llvm::PassManager<llvm::Function, llvm::AnalysisManager<llvm::Function>>::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x102c6fe78)
#17 0x0000000102da42e8 llvm::ModuleToFunctionPassAdaptor::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x102c742e8)
#18 0x0000000102d9f0a4 llvm::PassManager<llvm::Module, llvm::AnalysisManager<llvm::Module>>::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x102c6f0a4)
#19 0x00000001011bacec llvm::lto::opt(llvm::lto::Config const&, llvm::TargetMachine*, unsigned int, llvm::Module&, bool, llvm::ModuleSummaryIndex*, llvm::ModuleSummaryIndex const*, std::__1::vector<unsigned char, std::__1::allocator<unsigned char>> const&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x10108acec)
#20 0x00000001011bb944 llvm::lto::backend(llvm::lto::Config const&, std::__1::function<llvm::Expected<std::__1::unique_ptr<llvm::CachedFileStream, std::__1::default_delete<llvm::CachedFileStream>>> (unsigned int, llvm::Twine const&)>, unsigned int, llvm::Module&, llvm::ModuleSummaryIndex&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x10108b944)
#21 0x00000001011a4ccc llvm::lto::LTO::runRegularLTO(std::__1::function<llvm::Expected<std::__1::unique_ptr<llvm::CachedFileStream, std::__1::default_delete<llvm::CachedFileStream>>> (unsigned int, llvm::Twine const&)>) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101074ccc)
#22 0x00000001011a4040 llvm::lto::LTO::run(std::__1::function<llvm::Expected<std::__1::unique_ptr<llvm::CachedFileStream, std::__1::default_delete<llvm::CachedFileStream>>> (unsigned int, llvm::Twine const&)>, llvm::FileCache) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x101074040)
#23 0x00000001001fba34 lld::elf::BitcodeCompiler::compile() (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x1000cba34)
#24 0x0000000100187cc8 void lld::elf::LinkerDriver::compileBitcodeFiles<llvm::object::ELFType<(llvm::endianness)1, true>>(bool) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x100057cc8)
#25 0x000000010016aac4 void lld::elf::LinkerDriver::link<llvm::object::ELFType<(llvm::endianness)1, true>>(llvm::opt::InputArgList&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x10003aac4)
#26 0x0000000100157784 lld::elf::LinkerDriver::linkerMain(llvm::ArrayRef<char const*>) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x100027784)
#27 0x0000000100154968 lld::elf::link(llvm::ArrayRef<char const*>, llvm::raw_ostream&, llvm::raw_ostream&, bool, bool) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x100024968)
#28 0x00000001005070e0 lld::unsafeLldMain(llvm::ArrayRef<char const*>, llvm::raw_ostream&, llvm::raw_ostream&, llvm::ArrayRef<lld::DriverDef>, bool) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x1003d70e0)
#29 0x0000000100131ee0 lld_main(int, char**, llvm::ToolContext const&) (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x100001ee0)
#30 0x0000000100132624 main (/Users/luke/Developer/llvm-project/build.release/bin/lld+0x100002624)
#31 0x000000018d1950e0 

This is with an LTO build at -O3 on rva23u64

Do you have a test?

@lukel97
Copy link
Contributor

lukel97 commented Dec 6, 2024

No, it's with an LTO build so there's no single .ll or .c file I can pick out to reduce yet. Do you have access to SPEC CPU 2017?

@HanKuanChen
Copy link
Contributor Author

Could you reproduce it with -Wl,-plugin-opt=-opt-bisect-limit=-1 and -Wl,-plugin-opt=-print-before-pass-number=<N>?
-Wl,-plugin-opt=-print-module-scope is also useful (it dumps the whole module so you can extract a standalone .ll reproducer from the LTO build).

@lukel97
Copy link
Contributor

lukel97 commented Dec 6, 2024

Oh thanks, I was able to get the whole module out. Running llvm-reduce on it now.

@lukel97
Copy link
Contributor

lukel97 commented Dec 6, 2024

target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
target triple = "riscv64-unknown-linux-gnu"

; llvm-reduce'd reproducer for the SLP vectorizer crash (cast<Ty>() assertion in
; TreeEntry::setOperand) — reproduce with: opt -passes=slp-vectorizer.
; NOTE(review): the exact instruction mix (poison store + two identical fmuls,
; with %2 stored twice) and the uselistorder directive below appear to be what
; triggers the bad operand reordering — do not simplify further.
define ptr @_ZNK6dealii7FE_PolyINS_24TensorProductPolynomialsILi3EEELi3ELi3EE8get_dataENS_11UpdateFlagsERKNS_7MappingILi3ELi3EEERKNS_10QuadratureILi3EEE() #0 personality ptr null {
  ; Four consecutive 8-byte stores at offsets 0/8/16/24 form the store chain
  ; that vectorizeStoreChain picks up (see the stack trace above).
  store double poison, ptr null, align 8
  %1 = getelementptr i8, ptr null, i64 8
  %2 = fmul double 0.000000e+00, 0.000000e+00
  store double %2, ptr %1, align 8
  %3 = getelementptr i8, ptr null, i64 16
  %4 = fmul double 0.000000e+00, 0.000000e+00
  store double %4, ptr %3, align 8
  %5 = getelementptr i8, ptr null, i64 24
  ; %2 is reused here (not %4), so one fmul has two uses — presumably relevant
  ; to the operand-reordering bug; verify before altering.
  store double %2, ptr %5, align 8
  ret ptr null
}

; uselistorder directives
uselistorder ptr null, { 1, 2, 3, 4, 5, 6, 7, 0 }

attributes #0 = { "target-features"="+64bit,+a,+b,+c,+d,+experimental,+f,+m,+relax,+supm,+v,+za64rs,+zaamo,+zalrsc,+zawrs,+zba,+zbb,+zbs,+zca,+zcb,+zcmop,+zfa,+zfhmin,+zic64b,+zicbom,+zicbop,+zicboz,+ziccamoa,+ziccif,+zicclsm,+ziccrse,+zicntr,+zicond,+zicsr,+zihintntl,+zihintpause,+zihpm,+zimop,+zkt,+zmmul,+zvbb,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvfhmin,+zvkb,+zvkt,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-smctr,-experimental-ssctr,-experimental-svukte,-experimental-xqcia,-experimental-xqcicsr,-experimental-xqcisls,-experimental-zalasr,-experimental-zicfilp,-experimental-zicfiss,-experimental-zvbc32e,-experimental-zvkgs,-h,-sha,-shcounterenw,-shgatpa,-shtvala,-shvsatpa,-shvstvala,-shvstvecd,-smaia,-smcdeleg,-smcsrind,-smdbltrp,-smepmp,-smmpm,-smnpm,-smrnmi,-smstateen,-ssaia,-ssccfg,-ssccptr,-sscofpmf,-sscounterenw,-sscsrind,-ssdbltrp,-ssnpm,-sspm,-ssqosid,-ssstateen,-ssstrict,-sstc,-sstvala,-sstvecd,-ssu64xl,-svade,-svadu,-svbare,-svinval,-svnapot,-svpbmt,-svvptc,-xcvalu,-xcvbi,-xcvbitmanip,-xcvelw,-xcvmac,-xcvmem,-xcvsimd,-xsfcease,-xsfvcp,-xsfvfnrclipxfqf,-xsfvfwmaccqqq,-xsfvqmaccdod,-xsfvqmaccqoq,-xsifivecdiscarddlone,-xsifivecflushdlone,-xtheadba,-xtheadbb,-xtheadbs,-xtheadcmo,-xtheadcondmov,-xtheadfmemidx,-xtheadmac,-xtheadmemidx,-xtheadmempair,-xtheadsync,-xtheadvdot,-xventanacondops,-xwchc,-za128rs,-zabha,-zacas,-zama16b,-zbc,-zbkb,-zbkc,-zbkx,-zcd,-zce,-zcf,-zcmp,-zcmt,-zdinx,-zfbfmin,-zfh,-zfinx,-zhinx,-zhinxmin,-zifencei,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-ztso,-zvbc,-zvfbfmin,-zvfbfwma,-zvfh,-zvkg,-zvkn,-zvknc,-zvkned,-zvkng,-zvknha,-zvknhb,-zvks,-zvksc,-zvksed,-zvksg,-zvksh,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" }

Can reproduce the crash with opt -passes=slp-vectorizer

nikic added a commit that referenced this pull request Dec 6, 2024
…nds. (#113880)"

This reverts commit 94fbe7e.

Causes a crash when linking mafft in ReleaseLTO-g config.
@nikic
Copy link
Contributor

nikic commented Dec 6, 2024

Reverted this because it breaks the llvm-test-suite build in ReleaseLTO-g configuration.

HanKuanChen added a commit to HanKuanChen/llvm-project that referenced this pull request Dec 10, 2024
…vm#113880)

To reduce repeated code, TreeEntry::setOperandsInOrder will be replaced
by VLOperands.
Arg_size will be provided to make sure other operands will not be
reordered when VL[0] is IntrinsicInst (because APO is a boolean value).
In addition, BoUpSLP::reorderInputsAccordingToOpcode will also be
removed since it is simple.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

6 participants