MachineScheduler: Improve instruction clustering #137784

Merged · 9 commits · Jun 5, 2025 · Changes from all commits
14 changes: 6 additions & 8 deletions llvm/include/llvm/CodeGen/MachineScheduler.h
@@ -304,10 +304,6 @@ class LLVM_ABI ScheduleDAGMI : public ScheduleDAGInstrs {
/// The bottom of the unscheduled zone.
MachineBasicBlock::iterator CurrentBottom;

/// Record the next node in a scheduled cluster.
const SUnit *NextClusterPred = nullptr;
const SUnit *NextClusterSucc = nullptr;

#if LLVM_ENABLE_ABI_BREAKING_CHECKS
/// The number of instructions scheduled so far. Used to cut off the
/// scheduler at the point determined by misched-cutoff.
@@ -368,10 +364,6 @@ class LLVM_ABI ScheduleDAGMI : public ScheduleDAGInstrs {
/// live ranges and region boundary iterators.
void moveInstruction(MachineInstr *MI, MachineBasicBlock::iterator InsertPos);

const SUnit *getNextClusterPred() const { return NextClusterPred; }

const SUnit *getNextClusterSucc() const { return NextClusterSucc; }

void viewGraph(const Twine &Name, const Twine &Title) override;
void viewGraph() override;

@@ -1295,6 +1287,9 @@ class LLVM_ABI GenericScheduler : public GenericSchedulerBase {
SchedBoundary Top;
SchedBoundary Bot;

ClusterInfo *TopCluster;
ClusterInfo *BotCluster;

/// Candidate last picked from Top boundary.
SchedCandidate TopCand;
/// Candidate last picked from Bot boundary.
@@ -1335,6 +1330,9 @@ class LLVM_ABI PostGenericScheduler : public GenericSchedulerBase {
/// Candidate last picked from Bot boundary.
SchedCandidate BotCand;

ClusterInfo *TopCluster;
ClusterInfo *BotCluster;

public:
PostGenericScheduler(const MachineSchedContext *C)
: GenericSchedulerBase(C), Top(SchedBoundary::TopQID, "TopQ"),
7 changes: 7 additions & 0 deletions llvm/include/llvm/CodeGen/ScheduleDAG.h
@@ -17,6 +17,7 @@

#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/iterator.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -235,6 +236,10 @@ class TargetRegisterInfo;
LLVM_ABI void dump(const TargetRegisterInfo *TRI = nullptr) const;
};

/// Keep a record of which SUnits are in the same cluster group.
typedef SmallSet<SUnit *, 8> ClusterInfo;
constexpr unsigned InvalidClusterId = ~0u;

/// Scheduling unit. This is a node in the scheduling DAG.
class SUnit {
private:
@@ -275,6 +280,8 @@ class TargetRegisterInfo;
unsigned TopReadyCycle = 0; ///< Cycle relative to start when node is ready.
unsigned BotReadyCycle = 0; ///< Cycle relative to end when node is ready.

unsigned ParentClusterIdx = InvalidClusterId; ///< The parent cluster id.

private:
unsigned Depth = 0; ///< Node depth.
unsigned Height = 0; ///< Node height.
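
The header changes above replace the scheduler's forecast pointers with explicit cluster membership: each SUnit now carries a ParentClusterIdx that indexes into a per-DAG array of ClusterInfo sets, and InvalidClusterId (~0u) marks nodes that were never clustered. A minimal sketch of a membership test built on these fields (helper name invented for illustration):

    // Sketch only: true when SU belongs to the given cluster group.
    // ClusterInfo is the SmallSet<SUnit *, 8> typedef added above.
    static bool inCluster(SUnit *SU, ClusterInfo *Cluster) {
      // Unclustered nodes keep the InvalidClusterId sentinel.
      if (SU->ParentClusterIdx == InvalidClusterId)
        return false;
      return Cluster && Cluster->contains(SU);
    }
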
10 changes: 10 additions & 0 deletions llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
@@ -181,6 +181,8 @@ namespace llvm {
/// case of a huge region that gets reduced).
SUnit *BarrierChain = nullptr;

SmallVector<ClusterInfo> Clusters;

public:
/// A list of SUnits, used in Value2SUsMap, during DAG construction.
/// Note: to gain speed it might be worth investigating an optimized
@@ -384,6 +386,14 @@ namespace llvm {
/// equivalent edge already existed (false indicates failure).
bool addEdge(SUnit *SuccSU, const SDep &PredDep);

/// Returns the array of clusters.
SmallVector<ClusterInfo> &getClusters() { return Clusters; }

/// Get the cluster with the given index; returns nullptr for InvalidClusterId.
ClusterInfo *getCluster(unsigned Idx) {
return Idx != InvalidClusterId ? &Clusters[Idx] : nullptr;
}

protected:
void initSUnits();
void addPhysRegDataDeps(SUnit *SU, unsigned OperIdx);
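
With these accessors, recovering the whole cluster of a node is a constant-time lookup instead of a walk over cluster edges. A short usage sketch (hypothetical helper, mirroring how the schedulers below consume the API):

    // Sketch: fetch the cluster of a just-scheduled node, if any.
    // getCluster returns nullptr when Idx is InvalidClusterId.
    static void noteScheduled(ScheduleDAGInstrs &DAG, SUnit *SU) {
      if (ClusterInfo *Cluster = DAG.getCluster(SU->ParentClusterIdx)) {
        for (SUnit *Member : *Cluster) {
          // The remaining members are preferred picks at this boundary.
          (void)Member;
        }
      }
    }
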
82 changes: 52 additions & 30 deletions llvm/lib/CodeGen/MachineScheduler.cpp
@@ -15,6 +15,7 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/PriorityQueue.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
@@ -943,8 +944,6 @@ void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) {

if (SuccEdge->isWeak()) {
--SuccSU->WeakPredsLeft;
if (SuccEdge->isCluster())
NextClusterSucc = SuccSU;
return;
}
#ifndef NDEBUG
@@ -967,12 +966,6 @@ void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) {

/// releaseSuccessors - Call releaseSucc on each of SU's successors.
void ScheduleDAGMI::releaseSuccessors(SUnit *SU) {
// Reset the next successor, For example, we want to cluster A B C.
// After A is picked, we will set B as next cluster succ, but if we pick
// D instead of B after A, then we need to reset the next cluster succ because
// we have decided to not pick the cluster candidate B during pickNode().
// Leaving B as the NextClusterSucc just make things messy.
NextClusterSucc = nullptr;
for (SDep &Succ : SU->Succs)
releaseSucc(SU, &Succ);
}
@@ -986,8 +979,6 @@ void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) {

if (PredEdge->isWeak()) {
--PredSU->WeakSuccsLeft;
if (PredEdge->isCluster())
NextClusterPred = PredSU;
return;
}
#ifndef NDEBUG
@@ -1010,7 +1001,6 @@

/// releasePredecessors - Call releasePred on each of SU's predecessors.
void ScheduleDAGMI::releasePredecessors(SUnit *SU) {
NextClusterPred = nullptr;
for (SDep &Pred : SU->Preds)
releasePred(SU, &Pred);
}
@@ -1184,11 +1174,8 @@ findRootsAndBiasEdges(SmallVectorImpl<SUnit*> &TopRoots,
}

/// Identify DAG roots and setup scheduler queues.
void ScheduleDAGMI::initQueues(ArrayRef<SUnit*> TopRoots,
ArrayRef<SUnit*> BotRoots) {
NextClusterSucc = nullptr;
NextClusterPred = nullptr;

void ScheduleDAGMI::initQueues(ArrayRef<SUnit *> TopRoots,
ArrayRef<SUnit *> BotRoots) {
// Release all DAG roots for scheduling, not including EntrySU/ExitSU.
//
// Nodes with unreleased weak edges can still be roots.
@@ -2116,6 +2103,7 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
ScheduleDAGInstrs *DAG) {
// Keep track of the current cluster length and bytes for each SUnit.
DenseMap<unsigned, std::pair<unsigned, unsigned>> SUnit2ClusterInfo;
EquivalenceClasses<SUnit *> Clusters;

// At this point, `MemOpRecords` array must hold at least two mem ops. Try to
// cluster mem ops collected within `MemOpRecords` array.
@@ -2155,13 +2143,15 @@

SUnit *SUa = MemOpa.SU;
SUnit *SUb = MemOpb.SU;

if (!ReorderWhileClustering && SUa->NodeNum > SUb->NodeNum)
std::swap(SUa, SUb);

// FIXME: Is this check really required?
if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster)))
continue;

Clusters.unionSets(SUa, SUb);
LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
<< SUb->NodeNum << ")\n");
++NumClustered;
@@ -2201,6 +2191,21 @@
<< ", Curr cluster bytes: " << CurrentClusterBytes
<< "\n");
}

// Add cluster group information.
// Iterate over all of the equivalence sets.
auto &AllClusters = DAG->getClusters();
for (const EquivalenceClasses<SUnit *>::ECValue *I : Clusters) {
if (!I->isLeader())
continue;
ClusterInfo Group;
unsigned ClusterIdx = AllClusters.size();
for (SUnit *MemberI : Clusters.members(*I)) {
MemberI->ParentClusterIdx = ClusterIdx;
Group.insert(MemberI);
}
AllClusters.push_back(Group);
}
}
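
The grouping step above relies on llvm::EquivalenceClasses to merge pairwise cluster edges into maximal groups before indices are assigned, so edges A-B and B-C end up as one {A, B, C} cluster. A toy sketch of that pattern, assuming the same iteration API the code above uses:

    // Toy sketch: union pairwise relations, then walk each class once
    // through its leader.
    #include "llvm/ADT/EquivalenceClasses.h"
    #include "llvm/Support/raw_ostream.h"

    void demoClusters() {
      llvm::EquivalenceClasses<int> EC;
      EC.unionSets(1, 2); // {1, 2}
      EC.unionSets(2, 3); // merges into {1, 2, 3}
      EC.unionSets(7, 8); // independent group {7, 8}
      for (const auto *V : EC) {
        if (!V->isLeader())
          continue; // enumerate each class exactly once
        for (int M : EC.members(*V))
          llvm::outs() << M << ' ';
        llvm::outs() << '\n';
      }
    }
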

void BaseMemOpClusterMutation::collectMemOpRecords(
@@ -3688,6 +3693,9 @@ void GenericScheduler::initialize(ScheduleDAGMI *dag) {
}
TopCand.SU = nullptr;
BotCand.SU = nullptr;

TopCluster = nullptr;
BotCluster = nullptr;
}

/// Initialize the per-region scheduling policy.
@@ -3997,13 +4005,11 @@ bool GenericScheduler::tryCandidate(SchedCandidate &Cand,
// This is a best effort to set things up for a post-RA pass. Optimizations
// like generating loads of multiple registers should ideally be done within
// the scheduler pass by combining the loads during DAG postprocessing.
const SUnit *CandNextClusterSU =
Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
const SUnit *TryCandNextClusterSU =
TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
if (tryGreater(TryCand.SU == TryCandNextClusterSU,
Cand.SU == CandNextClusterSU,
TryCand, Cand, Cluster))
const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster;
const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster;
if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU),
CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand,
Cluster))
return TryCand.Reason != NoCand;

if (SameBoundary) {
@@ -4262,11 +4268,25 @@ void GenericScheduler::reschedulePhysReg(SUnit *SU, bool isTop) {
void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
if (IsTopNode) {
SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle());
TopCluster = DAG->getCluster(SU->ParentClusterIdx);
LLVM_DEBUG(if (TopCluster) {
dbgs() << " Top Cluster: ";
for (auto *N : *TopCluster)
dbgs() << N->NodeNum << '\t';
dbgs() << '\n';
});
Top.bumpNode(SU);
if (SU->hasPhysRegUses)
reschedulePhysReg(SU, true);
} else {
SU->BotReadyCycle = std::max(SU->BotReadyCycle, Bot.getCurrCycle());
BotCluster = DAG->getCluster(SU->ParentClusterIdx);
LLVM_DEBUG(if (BotCluster) {
dbgs() << " Bot Cluster: ";
for (auto *N : *BotCluster)
dbgs() << N->NodeNum << '\t';
dbgs() << '\n';
});
Bot.bumpNode(SU);
if (SU->hasPhysRegDefs)
reschedulePhysReg(SU, false);
@@ -4303,6 +4323,8 @@ void PostGenericScheduler::initialize(ScheduleDAGMI *Dag) {
if (!Bot.HazardRec) {
Bot.HazardRec = DAG->TII->CreateTargetMIHazardRecognizer(Itin, DAG);
}
TopCluster = nullptr;
BotCluster = nullptr;
}

void PostGenericScheduler::initPolicy(MachineBasicBlock::iterator Begin,
@@ -4367,14 +4389,12 @@ bool PostGenericScheduler::tryCandidate(SchedCandidate &Cand,
return TryCand.Reason != NoCand;

// Keep clustered nodes together.
const SUnit *CandNextClusterSU =
Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
const SUnit *TryCandNextClusterSU =
TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
if (tryGreater(TryCand.SU == TryCandNextClusterSU,
Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster;
const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster;
if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU),
CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand,
Cluster))
return TryCand.Reason != NoCand;

// Avoid critical resource consumption and balance the schedule.
if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
TryCand, Cand, ResourceReduce))
@@ -4571,9 +4591,11 @@ SUnit *PostGenericScheduler::pickNode(bool &IsTopNode) {
void PostGenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
if (IsTopNode) {
SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle());
TopCluster = DAG->getCluster(SU->ParentClusterIdx);
Top.bumpNode(SU);
} else {
SU->BotReadyCycle = std::max(SU->BotReadyCycle, Bot.getCurrCycle());
BotCluster = DAG->getCluster(SU->ParentClusterIdx);
Bot.bumpNode(SU);
}
}
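
Taken together, the strategy changes swap an equality test against a single forecast node (the old getNextClusterPred/getNextClusterSucc) for a set-membership test against the cluster of the last node scheduled at each boundary. Unlike the single pointer, the cached set stays valid when the scheduler interleaves picks from outside the cluster, which is what the deleted "reset NextClusterSucc" comment had to work around. Condensed, the shared comparison step reads:

    // Sketch: the cluster heuristic now shared by the pre- and post-RA
    // paths. TopCluster/BotCluster cache the cluster of the node most
    // recently scheduled at the respective boundary.
    const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster;
    const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster;
    // Prefer the candidate that is a member of the active cluster.
    if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU),
                   CandCluster && CandCluster->contains(Cand.SU),
                   TryCand, Cand, Cluster))
      return TryCand.Reason != NoCand;
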
13 changes: 13 additions & 0 deletions llvm/lib/CodeGen/MacroFusion.cpp
@@ -61,6 +61,11 @@ bool llvm::fuseInstructionPair(ScheduleDAGInstrs &DAG, SUnit &FirstSU,
for (SDep &SI : SecondSU.Preds)
if (SI.isCluster())
return false;

unsigned FirstCluster = FirstSU.ParentClusterIdx;
unsigned SecondCluster = SecondSU.ParentClusterIdx;
assert(FirstCluster == InvalidClusterId && SecondCluster == InvalidClusterId);

// Though the reachability checks above could be made more generic,
// perhaps as part of ScheduleDAGInstrs::addEdge(), since such edges are valid,
// the extra computation cost makes it less interesting in general cases.
@@ -70,6 +75,14 @@
if (!DAG.addEdge(&SecondSU, SDep(&FirstSU, SDep::Cluster)))
return false;

auto &Clusters = DAG.getClusters();

FirstSU.ParentClusterIdx = Clusters.size();
SecondSU.ParentClusterIdx = Clusters.size();

SmallSet<SUnit *, 8> Cluster{{&FirstSU, &SecondSU}};
Clusters.push_back(Cluster);

// TODO - If we want to chain more than two instructions, we need to create
// artificial edges to make dependencies from the FirstSU also dependent
// on other chained instructions, and other chained instructions also
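
A macro-fused pair is now also registered as a two-element cluster group, so fusion and load/store clustering flow through the same ParentClusterIdx machinery. For context, targets drive fuseInstructionPair through a fusion predicate; a minimal sketch (predicate and opcode choice invented for illustration, assuming the createMacroFusionDAGMutation helper from llvm/CodeGen/MacroFusion.h):

    // Sketch: a target-side fusion predicate. The DAG mutation calls
    // fuseInstructionPair when this returns true for adjacent instructions.
    static bool shouldFuseCmpBranch(const TargetInstrInfo &TII,
                                    const TargetSubtargetInfo &STI,
                                    const MachineInstr *FirstMI,
                                    const MachineInstr &SecondMI) {
      // A null FirstMI asks whether SecondMI can fuse with some predecessor.
      return (!FirstMI || FirstMI->isCompare()) &&
             SecondMI.isConditionalBranch();
    }
    // Registered from the target's scheduler setup, for example:
    //   DAG->addMutation(createMacroFusionDAGMutation({shouldFuseCmpBranch}));
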
3 changes: 3 additions & 0 deletions llvm/lib/CodeGen/ScheduleDAG.cpp
@@ -365,6 +365,9 @@ LLVM_DUMP_METHOD void ScheduleDAG::dumpNodeName(const SUnit &SU) const {
LLVM_DUMP_METHOD void ScheduleDAG::dumpNodeAll(const SUnit &SU) const {
dumpNode(SU);
SU.dumpAttributes();
if (SU.ParentClusterIdx != InvalidClusterId)
dbgs() << " Parent Cluster Index: " << SU.ParentClusterIdx << '\n';

if (SU.Preds.size() > 0) {
dbgs() << " Predecessors:\n";
for (const SDep &Dep : SU.Preds) {
22 changes: 10 additions & 12 deletions llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -589,12 +589,11 @@ bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
// This is a best effort to set things up for a post-RA pass. Optimizations
// like generating loads of multiple registers should ideally be done within
// the scheduler pass by combining the loads during DAG postprocessing.
const SUnit *CandNextClusterSU =
Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
const SUnit *TryCandNextClusterSU =
TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
if (tryGreater(TryCand.SU == TryCandNextClusterSU,
Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster;
const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster;
if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU),
CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand,
Cluster))
return TryCand.Reason != NoCand;

// Avoid increasing the max critical pressure in the scheduled region.
@@ -664,12 +663,11 @@ bool GCNMaxMemoryClauseSchedStrategy::tryCandidate(SchedCandidate &Cand,

// MaxMemoryClause-specific: We prioritize clustered instructions as we would
// get more benefit from clausing these memory instructions.
const SUnit *CandNextClusterSU =
Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
const SUnit *TryCandNextClusterSU =
TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
if (tryGreater(TryCand.SU == TryCandNextClusterSU,
Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster;
const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster;
if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU),
CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand,
Cluster))
return TryCand.Reason != NoCand;

// We only compare a subset of features when comparing nodes between