Skip to content

[AMDGPU] Teach iterative schedulers about IGLP #134953

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Apr 11, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -616,12 +616,15 @@ createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
return DAG;
}

// Factory for the iterative minimal-register-pressure scheduler.
// Registers the IGLP DAG mutation so IGLP instructions (e.g.
// SCHED_GROUP_BARRIER / IGLP_OPT) are honored during the initial
// scheduling phase.
static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
  auto *DAG = new GCNIterativeScheduler(
      C, GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
  return DAG;
}

static ScheduleDAGInstrs *
Expand All @@ -632,6 +635,7 @@ createIterativeILPMachineScheduler(MachineSchedContext *C) {
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
return DAG;
}

Expand Down
49 changes: 40 additions & 9 deletions llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//

#include "GCNIterativeScheduler.h"
#include "AMDGPUIGroupLP.h"
#include "GCNSchedStrategy.h"
#include "SIMachineFunctionInfo.h"

Expand Down Expand Up @@ -118,21 +119,42 @@ void GCNIterativeScheduler::printSchedRP(raw_ostream &OS,
}
#endif

// Swap in a fresh IGLP DAG mutation when the region contains IGLP-only
// instructions (see SIInstrInfo::isIGLPMutationOnly).
//
// Unlike the multi-stage GCN scheduler there are too few stages here to make
// caching the mutation set worthwhile, so the current mutations are simply
// stashed in SavedMutations and a single IGLP mutation is installed for the
// appropriate scheduling phase.
//
// \param R         scheduling region to scan for IGLP instructions.
// \param IsReentry true when the region is being rescheduled after an
//                  occupancy-maximization pass; selects PreRAReentry phase.
void GCNIterativeScheduler::swapIGLPMutations(const Region &R, bool IsReentry) {
  bool HasIGLPInstrs = false;
  const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(TII);
  for (MachineBasicBlock::iterator I = R.Begin; I != R.End; I++) {
    if (SII->isIGLPMutationOnly(I->getOpcode())) {
      HasIGLPInstrs = true;
      break;
    }
  }

  if (HasIGLPInstrs) {
    // IGLP instructions are mutually exclusive with all other mutations, so
    // displace the existing set rather than appending to it.
    SavedMutations.clear();
    SavedMutations.swap(Mutations);
    auto SchedPhase = IsReentry ? AMDGPU::SchedulingPhase::PreRAReentry
                                : AMDGPU::SchedulingPhase::Initial;

    addMutation(createIGroupLPDAGMutation(SchedPhase));
  }
}

// DAG builder helper
class GCNIterativeScheduler::BuildDAG {
GCNIterativeScheduler &Sch;
SmallVector<SUnit *, 8> TopRoots;

SmallVector<SUnit*, 8> BotRoots;
public:
BuildDAG(const Region &R, GCNIterativeScheduler &_Sch)
: Sch(_Sch) {
BuildDAG(const Region &R, GCNIterativeScheduler &_Sch, bool IsReentry = false)
: Sch(_Sch) {
auto *BB = R.Begin->getParent();
Sch.BaseClass::startBlock(BB);
Sch.BaseClass::enterRegion(BB, R.Begin, R.End, R.NumRegionInstrs);

Sch.swapIGLPMutations(R, IsReentry);
Sch.buildSchedGraph(Sch.AA, nullptr, nullptr, nullptr,
/*TrackLaneMask*/true);
Sch.postProcessDAG();
Sch.Topo.InitDAGTopologicalSorting();
Sch.findRootsAndBiasEdges(TopRoots, BotRoots);
}
Expand Down Expand Up @@ -432,13 +454,15 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {

auto NewOcc = TargetOcc;
for (auto *R : Regions) {
// Always build the DAG to add mutations
BuildDAG DAG(*R, *this);

if (R->MaxPressure.getOccupancy(ST) >= NewOcc)
break;
continue;

LLVM_DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3);
printLivenessInfo(dbgs(), R->Begin, R->End, LIS));

BuildDAG DAG(*R, *this);
const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this);
const auto MaxRP = getSchedulePressure(*R, MinSchedule);
LLVM_DEBUG(dbgs() << "Occupancy improvement attempt:\n";
Expand Down Expand Up @@ -469,8 +493,11 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
sortRegionsByPressure(TgtOcc);
auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);

if (TryMaximizeOccupancy && Occ < TgtOcc)
bool IsReentry = false;
if (TryMaximizeOccupancy && Occ < TgtOcc) {
Occ = tryMaximizeOccupancy(TgtOcc);
IsReentry = true;
}

// This is really weird but for some magic scheduling regions twice
// gives performance improvement
Expand All @@ -489,7 +516,8 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
LStrgy.setTargetOccupancy(I == 0 ? 0 : TgtOcc);
for (auto *R : Regions) {
OverrideLegacyStrategy Ovr(*R, LStrgy, *this);

IsReentry |= I > 0;
swapIGLPMutations(*R, IsReentry);
Ovr.schedule();
const auto RP = getRegionPressure(*R);
LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
Expand Down Expand Up @@ -556,8 +584,11 @@ void GCNIterativeScheduler::scheduleILP(
sortRegionsByPressure(TgtOcc);
auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);

if (TryMaximizeOccupancy && Occ < TgtOcc)
bool IsReentry = false;
if (TryMaximizeOccupancy && Occ < TgtOcc) {
Occ = tryMaximizeOccupancy(TgtOcc);
IsReentry = true;
}

TgtOcc = std::min(Occ, TgtOcc);
LLVM_DEBUG(dbgs() << "Scheduling using default scheduler, "
Expand All @@ -566,7 +597,7 @@ void GCNIterativeScheduler::scheduleILP(

unsigned FinalOccupancy = std::min(Occ, MFI->getOccupancy());
for (auto *R : Regions) {
BuildDAG DAG(*R, *this);
BuildDAG DAG(*R, *this, IsReentry);
const auto ILPSchedule = makeGCNILPScheduler(DAG.getBottomRoots(), *this);

const auto RP = getSchedulePressure(*R, ILPSchedule);
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ class GCNIterativeScheduler : public ScheduleDAGMILive {
const StrategyKind Strategy;
mutable GCNUpwardRPTracker UPTracker;

std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations;

class BuildDAG;
class OverrideLegacyStrategy;

Expand All @@ -91,6 +93,7 @@ class GCNIterativeScheduler : public ScheduleDAGMILive {
return getRegionPressure(R.Begin, R.End);
}

void swapIGLPMutations(const Region &R, bool IsReentry);
void setBestSchedule(Region &R,
ScheduleRef Schedule,
const GCNRegPressure &MaxRP = GCNRegPressure());
Expand Down
14 changes: 5 additions & 9 deletions llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -188,12 +188,6 @@ static void getRegisterPressures(
Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
}

// Return true if the instruction is mutually exclusive with all non-IGLP DAG
// mutations, requiring all other mutations to be disabled.
static bool isIGLPMutationOnly(unsigned Opcode) {
return Opcode == AMDGPU::SCHED_GROUP_BARRIER || Opcode == AMDGPU::IGLP_OPT;
}

void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
bool AtTop,
const RegPressureTracker &RPTracker,
Expand Down Expand Up @@ -1161,9 +1155,10 @@ bool GCNSchedStage::initGCNRegion() {
Unsched.reserve(DAG.NumRegionInstrs);
if (StageID == GCNSchedStageID::OccInitialSchedule ||
StageID == GCNSchedStageID::ILPInitialSchedule) {
const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG.TII);
for (auto &I : DAG) {
Unsched.push_back(&I);
if (isIGLPMutationOnly(I.getOpcode()))
if (SII->isIGLPMutationOnly(I.getOpcode()))
DAG.RegionsWithIGLPInstrs[RegionIdx] = true;
}
} else {
Expand Down Expand Up @@ -2047,8 +2042,9 @@ void GCNScheduleDAGMILive::updateRegionBoundaries(
}

// Return true if any instruction in the DAG's current region is an
// IGLP-mutation-only instruction (SCHED_GROUP_BARRIER / IGLP_OPT), as
// classified by SIInstrInfo::isIGLPMutationOnly.
static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
  const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII);
  return any_of(*DAG, [SII](MachineBasicBlock::iterator MI) {
    return SII->isIGLPMutationOnly(MI->getOpcode());
  });
}

Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -985,6 +985,12 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {

bool isIGLP(const MachineInstr &MI) const { return isIGLP(MI.getOpcode()); }

// Return true if the instruction is mutually exclusive with all non-IGLP DAG
// mutations, requiring all other mutations to be disabled.
bool isIGLPMutationOnly(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::SCHED_GROUP_BARRIER:
  case AMDGPU::IGLP_OPT:
    return true;
  default:
    return false;
  }
}

static unsigned getNonSoftWaitcntOpcode(unsigned Opcode) {
switch (Opcode) {
case AMDGPU::S_WAITCNT_soft:
Expand Down
2 changes: 2 additions & 0 deletions llvm/test/CodeGen/AMDGPU/iglp.opt.reentry.ll
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -O3 < %s | FileCheck %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -O3 -misched=gcn-iterative-max-occupancy-experimental < %s | FileCheck %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -O3 -misched=gcn-iterative-ilp < %s | FileCheck %s

; Test should not result in build failure
; CHECK-LABEL: shouldNotReApply
Expand Down
Loading
Loading