MachineScheduler: Improve instruction clustering #137784

Merged · 9 commits · Jun 5, 2025 · Changes from all commits
14 changes: 6 additions & 8 deletions llvm/include/llvm/CodeGen/MachineScheduler.h
@@ -304,10 +304,6 @@ class LLVM_ABI ScheduleDAGMI : public ScheduleDAGInstrs {
/// The bottom of the unscheduled zone.
MachineBasicBlock::iterator CurrentBottom;

/// Record the next node in a scheduled cluster.
const SUnit *NextClusterPred = nullptr;
const SUnit *NextClusterSucc = nullptr;

#if LLVM_ENABLE_ABI_BREAKING_CHECKS
/// The number of instructions scheduled so far. Used to cut off the
/// scheduler at the point determined by misched-cutoff.
@@ -368,10 +364,6 @@ class LLVM_ABI ScheduleDAGMI : public ScheduleDAGInstrs {
/// live ranges and region boundary iterators.
void moveInstruction(MachineInstr *MI, MachineBasicBlock::iterator InsertPos);

const SUnit *getNextClusterPred() const { return NextClusterPred; }

const SUnit *getNextClusterSucc() const { return NextClusterSucc; }

void viewGraph(const Twine &Name, const Twine &Title) override;
void viewGraph() override;

@@ -1295,6 +1287,9 @@ class LLVM_ABI GenericScheduler : public GenericSchedulerBase {
SchedBoundary Top;
SchedBoundary Bot;

ClusterInfo *TopCluster;
ClusterInfo *BotCluster;

/// Candidate last picked from Top boundary.
SchedCandidate TopCand;
/// Candidate last picked from Bot boundary.
@@ -1335,6 +1330,9 @@ class LLVM_ABI PostGenericScheduler : public GenericSchedulerBase {
/// Candidate last picked from Bot boundary.
SchedCandidate BotCand;

ClusterInfo *TopCluster;
ClusterInfo *BotCluster;

public:
PostGenericScheduler(const MachineSchedContext *C)
: GenericSchedulerBase(C), Top(SchedBoundary::TopQID, "TopQ"),
7 changes: 7 additions & 0 deletions llvm/include/llvm/CodeGen/ScheduleDAG.h
@@ -17,6 +17,7 @@

#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/iterator.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -235,6 +236,10 @@ class TargetRegisterInfo;
LLVM_ABI void dump(const TargetRegisterInfo *TRI = nullptr) const;
};

/// Keep a record of which SUnits are in the same cluster group.
typedef SmallSet<SUnit *, 8> ClusterInfo;
constexpr unsigned InvalidClusterId = ~0u;

/// Scheduling unit. This is a node in the scheduling DAG.
class SUnit {
private:
@@ -275,6 +280,8 @@ class TargetRegisterInfo;
unsigned TopReadyCycle = 0; ///< Cycle relative to start when node is ready.
unsigned BotReadyCycle = 0; ///< Cycle relative to end when node is ready.

unsigned ParentClusterIdx = InvalidClusterId; ///< The parent cluster id.

private:
unsigned Depth = 0; ///< Node depth.
unsigned Height = 0; ///< Node height.
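
The header changes above replace the scheduler's forecast pointers with explicit cluster membership: each SUnit now carries a ParentClusterIdx that indexes into a per-DAG array of ClusterInfo sets, and InvalidClusterId (~0u) marks nodes that were never clustered. A minimal sketch of a membership test built on these fields (helper name invented for illustration):

    // Sketch only: true when SU belongs to the given cluster group.
    // ClusterInfo is the SmallSet<SUnit *, 8> typedef added above.
    static bool inCluster(SUnit *SU, ClusterInfo *Cluster) {
      // Unclustered nodes keep the InvalidClusterId sentinel.
      if (SU->ParentClusterIdx == InvalidClusterId)
        return false;
      return Cluster && Cluster->contains(SU);
    }
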
10 changes: 10 additions & 0 deletions llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
@@ -181,6 +181,8 @@ namespace llvm {
/// case of a huge region that gets reduced).
SUnit *BarrierChain = nullptr;

SmallVector<ClusterInfo> Clusters;

public:
/// A list of SUnits, used in Value2SUsMap, during DAG construction.
/// Note: to gain speed it might be worth investigating an optimized
@@ -384,6 +386,14 @@ namespace llvm {
/// equivalent edge already existed (false indicates failure).
bool addEdge(SUnit *SuccSU, const SDep &PredDep);

/// Returns the array of clusters.
SmallVector<ClusterInfo> &getClusters() { return Clusters; }

/// Get the cluster with the given index; returns nullptr for InvalidClusterId.
ClusterInfo *getCluster(unsigned Idx) {
return Idx != InvalidClusterId ? &Clusters[Idx] : nullptr;
}

protected:
void initSUnits();
void addPhysRegDataDeps(SUnit *SU, unsigned OperIdx);
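
With these accessors, recovering the whole cluster of a node is a constant-time lookup instead of a walk over cluster edges. A short usage sketch (hypothetical helper, mirroring how the schedulers below consume the API):

    // Sketch: fetch the cluster of a just-scheduled node, if any.
    // getCluster returns nullptr when Idx is InvalidClusterId.
    static void noteScheduled(ScheduleDAGInstrs &DAG, SUnit *SU) {
      if (ClusterInfo *Cluster = DAG.getCluster(SU->ParentClusterIdx)) {
        for (SUnit *Member : *Cluster) {
          // The remaining members are preferred picks at this boundary.
          (void)Member;
        }
      }
    }
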
82 changes: 52 additions & 30 deletions llvm/lib/CodeGen/MachineScheduler.cpp
@@ -15,6 +15,7 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/PriorityQueue.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
@@ -943,8 +944,6 @@ void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) {

if (SuccEdge->isWeak()) {
--SuccSU->WeakPredsLeft;
if (SuccEdge->isCluster())
NextClusterSucc = SuccSU;
return;
}
#ifndef NDEBUG
@@ -967,12 +966,6 @@ void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) {

/// releaseSuccessors - Call releaseSucc on each of SU's successors.
void ScheduleDAGMI::releaseSuccessors(SUnit *SU) {
// Reset the next successor, For example, we want to cluster A B C.
// After A is picked, we will set B as next cluster succ, but if we pick
// D instead of B after A, then we need to reset the next cluster succ because
// we have decided to not pick the cluster candidate B during pickNode().
// Leaving B as the NextClusterSucc just make things messy.
NextClusterSucc = nullptr;
for (SDep &Succ : SU->Succs)
releaseSucc(SU, &Succ);
}
@@ -986,8 +979,6 @@ void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) {

if (PredEdge->isWeak()) {
--PredSU->WeakSuccsLeft;
if (PredEdge->isCluster())
NextClusterPred = PredSU;
return;
}
#ifndef NDEBUG
@@ -1010,7 +1001,6 @@

/// releasePredecessors - Call releasePred on each of SU's predecessors.
void ScheduleDAGMI::releasePredecessors(SUnit *SU) {
NextClusterPred = nullptr;
for (SDep &Pred : SU->Preds)
releasePred(SU, &Pred);
}
@@ -1184,11 +1174,8 @@ findRootsAndBiasEdges(SmallVectorImpl<SUnit*> &TopRoots,
}

/// Identify DAG roots and setup scheduler queues.
void ScheduleDAGMI::initQueues(ArrayRef<SUnit*> TopRoots,
ArrayRef<SUnit*> BotRoots) {
NextClusterSucc = nullptr;
NextClusterPred = nullptr;

void ScheduleDAGMI::initQueues(ArrayRef<SUnit *> TopRoots,
ArrayRef<SUnit *> BotRoots) {
// Release all DAG roots for scheduling, not including EntrySU/ExitSU.
//
// Nodes with unreleased weak edges can still be roots.
@@ -2116,6 +2103,7 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
ScheduleDAGInstrs *DAG) {
// Keep track of the current cluster length and bytes for each SUnit.
DenseMap<unsigned, std::pair<unsigned, unsigned>> SUnit2ClusterInfo;
EquivalenceClasses<SUnit *> Clusters;

// At this point, `MemOpRecords` array must hold at least two mem ops. Try to
// cluster mem ops collected within `MemOpRecords` array.
@@ -2155,13 +2143,15 @@

SUnit *SUa = MemOpa.SU;
SUnit *SUb = MemOpb.SU;

if (!ReorderWhileClustering && SUa->NodeNum > SUb->NodeNum)
std::swap(SUa, SUb);

// FIXME: Is this check really required?
if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster)))
continue;

Clusters.unionSets(SUa, SUb);
LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
<< SUb->NodeNum << ")\n");
++NumClustered;
@@ -2201,6 +2191,21 @@
<< ", Curr cluster bytes: " << CurrentClusterBytes
<< "\n");
}

// Add cluster group information.
// Iterate over all of the equivalence sets.
auto &AllClusters = DAG->getClusters();
for (const EquivalenceClasses<SUnit *>::ECValue *I : Clusters) {
if (!I->isLeader())
continue;
ClusterInfo Group;
unsigned ClusterIdx = AllClusters.size();
for (SUnit *MemberI : Clusters.members(*I)) {
MemberI->ParentClusterIdx = ClusterIdx;
Group.insert(MemberI);
}
AllClusters.push_back(Group);
}
}
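
The grouping step above relies on llvm::EquivalenceClasses to merge pairwise cluster edges into maximal groups before indices are assigned, so edges A-B and B-C end up as one {A, B, C} cluster. A toy sketch of that pattern, assuming the same iteration API the code above uses:

    // Toy sketch: union pairwise relations, then walk each class once
    // through its leader.
    #include "llvm/ADT/EquivalenceClasses.h"
    #include "llvm/Support/raw_ostream.h"

    void demoClusters() {
      llvm::EquivalenceClasses<int> EC;
      EC.unionSets(1, 2); // {1, 2}
      EC.unionSets(2, 3); // merges into {1, 2, 3}
      EC.unionSets(7, 8); // independent group {7, 8}
      for (const auto *V : EC) {
        if (!V->isLeader())
          continue; // enumerate each class exactly once
        for (int M : EC.members(*V))
          llvm::outs() << M << ' ';
        llvm::outs() << '\n';
      }
    }
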

void BaseMemOpClusterMutation::collectMemOpRecords(
@@ -3688,6 +3693,9 @@ void GenericScheduler::initialize(ScheduleDAGMI *dag) {
}
TopCand.SU = nullptr;
BotCand.SU = nullptr;

TopCluster = nullptr;
BotCluster = nullptr;
}

/// Initialize the per-region scheduling policy.
@@ -3997,13 +4005,11 @@ bool GenericScheduler::tryCandidate(SchedCandidate &Cand,
// This is a best effort to set things up for a post-RA pass. Optimizations
// like generating loads of multiple registers should ideally be done within
// the scheduler pass by combining the loads during DAG postprocessing.
const SUnit *CandNextClusterSU =
Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
const SUnit *TryCandNextClusterSU =
TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
if (tryGreater(TryCand.SU == TryCandNextClusterSU,
Cand.SU == CandNextClusterSU,
TryCand, Cand, Cluster))
const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster;
const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster;
if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU),
CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand,
Cluster))
return TryCand.Reason != NoCand;

if (SameBoundary) {
@@ -4262,11 +4268,25 @@ void GenericScheduler::reschedulePhysReg(SUnit *SU, bool isTop) {
void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
if (IsTopNode) {
SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle());
TopCluster = DAG->getCluster(SU->ParentClusterIdx);
LLVM_DEBUG(if (TopCluster) {
dbgs() << " Top Cluster: ";
for (auto *N : *TopCluster)
dbgs() << N->NodeNum << '\t';
dbgs() << '\n';
});
Top.bumpNode(SU);
if (SU->hasPhysRegUses)
reschedulePhysReg(SU, true);
} else {
SU->BotReadyCycle = std::max(SU->BotReadyCycle, Bot.getCurrCycle());
BotCluster = DAG->getCluster(SU->ParentClusterIdx);
LLVM_DEBUG(if (BotCluster) {
dbgs() << " Bot Cluster: ";
for (auto *N : *BotCluster)
dbgs() << N->NodeNum << '\t';
dbgs() << '\n';
});
Bot.bumpNode(SU);
if (SU->hasPhysRegDefs)
reschedulePhysReg(SU, false);
@@ -4303,6 +4323,8 @@ void PostGenericScheduler::initialize(ScheduleDAGMI *Dag) {
if (!Bot.HazardRec) {
Bot.HazardRec = DAG->TII->CreateTargetMIHazardRecognizer(Itin, DAG);
}
TopCluster = nullptr;
BotCluster = nullptr;
}

void PostGenericScheduler::initPolicy(MachineBasicBlock::iterator Begin,
@@ -4367,14 +4389,12 @@ bool PostGenericScheduler::tryCandidate(SchedCandidate &Cand,
return TryCand.Reason != NoCand;

// Keep clustered nodes together.
const SUnit *CandNextClusterSU =
Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
const SUnit *TryCandNextClusterSU =
TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
if (tryGreater(TryCand.SU == TryCandNextClusterSU,
Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster;
const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster;
if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU),
CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand,
Cluster))
return TryCand.Reason != NoCand;

// Avoid critical resource consumption and balance the schedule.
if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
TryCand, Cand, ResourceReduce))
@@ -4571,9 +4591,11 @@ SUnit *PostGenericScheduler::pickNode(bool &IsTopNode) {
void PostGenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
if (IsTopNode) {
SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle());
TopCluster = DAG->getCluster(SU->ParentClusterIdx);
Top.bumpNode(SU);
} else {
SU->BotReadyCycle = std::max(SU->BotReadyCycle, Bot.getCurrCycle());
BotCluster = DAG->getCluster(SU->ParentClusterIdx);
Bot.bumpNode(SU);
}
}
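
Taken together, the strategy changes swap an equality test against a single forecast node (the old getNextClusterPred/getNextClusterSucc) for a set-membership test against the cluster of the last node scheduled at each boundary. Unlike the single pointer, the cached set stays valid when the scheduler interleaves picks from outside the cluster, which is what the deleted "reset NextClusterSucc" comment had to work around. Condensed, the shared comparison step reads:

    // Sketch: the cluster heuristic now shared by the pre- and post-RA
    // paths. TopCluster/BotCluster cache the cluster of the node most
    // recently scheduled at the respective boundary.
    const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster;
    const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster;
    // Prefer the candidate that is a member of the active cluster.
    if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU),
                   CandCluster && CandCluster->contains(Cand.SU),
                   TryCand, Cand, Cluster))
      return TryCand.Reason != NoCand;
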
13 changes: 13 additions & 0 deletions llvm/lib/CodeGen/MacroFusion.cpp
@@ -61,6 +61,11 @@ bool llvm::fuseInstructionPair(ScheduleDAGInstrs &DAG, SUnit &FirstSU,
for (SDep &SI : SecondSU.Preds)
if (SI.isCluster())
return false;

unsigned FirstCluster = FirstSU.ParentClusterIdx;
unsigned SecondCluster = SecondSU.ParentClusterIdx;
assert(FirstCluster == InvalidClusterId && SecondCluster == InvalidClusterId);

// Though the reachability checks above could be made more generic,
// perhaps as part of ScheduleDAGInstrs::addEdge(), since such edges are valid,
// the extra computation cost makes it less interesting in general cases.
@@ -70,6 +75,14 @@
if (!DAG.addEdge(&SecondSU, SDep(&FirstSU, SDep::Cluster)))
return false;

auto &Clusters = DAG.getClusters();

FirstSU.ParentClusterIdx = Clusters.size();
SecondSU.ParentClusterIdx = Clusters.size();

SmallSet<SUnit *, 8> Cluster{{&FirstSU, &SecondSU}};
Clusters.push_back(Cluster);

// TODO - If we want to chain more than two instructions, we need to create
// artificial edges to make dependencies from the FirstSU also dependent
// on other chained instructions, and other chained instructions also
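
A macro-fused pair is now also registered as a two-element cluster group, so fusion and load/store clustering flow through the same ParentClusterIdx machinery. For context, targets drive fuseInstructionPair through a fusion predicate; a minimal sketch (predicate and opcode choice invented for illustration, assuming the createMacroFusionDAGMutation helper from llvm/CodeGen/MacroFusion.h):

    // Sketch: a target-side fusion predicate. The DAG mutation calls
    // fuseInstructionPair when this returns true for adjacent instructions.
    static bool shouldFuseCmpBranch(const TargetInstrInfo &TII,
                                    const TargetSubtargetInfo &STI,
                                    const MachineInstr *FirstMI,
                                    const MachineInstr &SecondMI) {
      // A null FirstMI asks whether SecondMI can fuse with some predecessor.
      return (!FirstMI || FirstMI->isCompare()) &&
             SecondMI.isConditionalBranch();
    }
    // Registered from the target's scheduler setup, for example:
    //   DAG->addMutation(createMacroFusionDAGMutation({shouldFuseCmpBranch}));
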
3 changes: 3 additions & 0 deletions llvm/lib/CodeGen/ScheduleDAG.cpp
@@ -365,6 +365,9 @@ LLVM_DUMP_METHOD void ScheduleDAG::dumpNodeName(const SUnit &SU) const {
LLVM_DUMP_METHOD void ScheduleDAG::dumpNodeAll(const SUnit &SU) const {
dumpNode(SU);
SU.dumpAttributes();
if (SU.ParentClusterIdx != InvalidClusterId)
dbgs() << " Parent Cluster Index: " << SU.ParentClusterIdx << '\n';

if (SU.Preds.size() > 0) {
dbgs() << " Predecessors:\n";
for (const SDep &Dep : SU.Preds) {
22 changes: 10 additions & 12 deletions llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -589,12 +589,11 @@ bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
// This is a best effort to set things up for a post-RA pass. Optimizations
// like generating loads of multiple registers should ideally be done within
// the scheduler pass by combining the loads during DAG postprocessing.
const SUnit *CandNextClusterSU =
Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
const SUnit *TryCandNextClusterSU =
TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
if (tryGreater(TryCand.SU == TryCandNextClusterSU,
Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster;
const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster;
if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU),
CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand,
Cluster))
return TryCand.Reason != NoCand;

// Avoid increasing the max critical pressure in the scheduled region.
@@ -664,12 +663,11 @@ bool GCNMaxMemoryClauseSchedStrategy::tryCandidate(SchedCandidate &Cand,

// MaxMemoryClause-specific: We prioritize clustered instructions as we would
// get more benefit from clausing these memory instructions.
const SUnit *CandNextClusterSU =
Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
const SUnit *TryCandNextClusterSU =
TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
if (tryGreater(TryCand.SU == TryCandNextClusterSU,
Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster;
const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster;
if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU),
CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand,
Cluster))
return TryCand.Reason != NoCand;

// We only compare a subset of features when comparing nodes between