Skip to content

Commit 2e1c12a

Browse files
authored
[AMDGPU] Allow using iterative schedulers with IGLP (llvm#1658)
Auto-submit by Jenkins
2 parents fb41200 + ba0142e commit 2e1c12a

11 files changed

+1026
-32
lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -424,10 +424,10 @@ static cl::opt<bool>
424424
cl::desc("Enable loop data prefetch on AMDGPU"),
425425
cl::Hidden, cl::init(false));
426426

427-
static cl::opt<bool> EnableMaxIlpSchedStrategy(
428-
"amdgpu-enable-max-ilp-scheduling-strategy",
429-
cl::desc("Enable scheduling strategy to maximize ILP for a single wave."),
430-
cl::Hidden, cl::init(false));
427+
// Command-line selection of a custom scheduling strategy; empty by default.
// GCNPassConfig::createMachineScheduler (when the SI scheduler is not enabled)
// compares the effective value against "max-ilp", "iterative-ilp",
// "iterative-minreg" and "iterative-maxocc"; any other value falls through to
// createGCNMaxOccupancyMachineScheduler. A valid "amdgpu-sched-strategy"
// function attribute takes precedence over this option.
static cl::opt<std::string>
428+
AMDGPUSchedStrategy("amdgpu-sched-strategy",
429+
cl::desc("Select custom AMDGPU scheduling strategy."),
430+
cl::Hidden, cl::init(""));
431431

432432
static cl::opt<bool> EnableRewritePartialRegUses(
433433
"amdgpu-enable-rewrite-partial-reg-uses",
@@ -563,12 +563,15 @@ createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
563563
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
564564
if (ST.shouldClusterStores())
565565
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
566+
DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
566567
return DAG;
567568
}
568569

569570
// Create a GCNIterativeScheduler that uses the SCHEDULE_MINREGFORCED strategy.
// With the added ('+') lines the scheduler is no longer returned directly: an
// IGroupLP DAG mutation for SchedulingPhase::Initial is attached first.
static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
570-
return new GCNIterativeScheduler(C,
571-
GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
571+
auto *DAG = new GCNIterativeScheduler(
572+
C, GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
573+
DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
574+
return DAG;
572575
}
573576

574577
static ScheduleDAGInstrs *
@@ -580,6 +583,7 @@ createIterativeILPMachineScheduler(MachineSchedContext *C) {
580583
if (ST.shouldClusterStores())
581584
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
582585
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
586+
DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
583587
return DAG;
584588
}
585589

@@ -1288,9 +1292,24 @@ ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
12881292
if (ST.enableSIScheduler())
12891293
return createSIMachineScheduler(C);
12901294

1291-
if (EnableMaxIlpSchedStrategy)
1295+
Attribute SchedStrategyAttr =
1296+
C->MF->getFunction().getFnAttribute("amdgpu-sched-strategy");
1297+
StringRef SchedStrategy = SchedStrategyAttr.isValid()
1298+
? SchedStrategyAttr.getValueAsString()
1299+
: AMDGPUSchedStrategy;
1300+
1301+
if (SchedStrategy == "max-ilp")
12921302
return createGCNMaxILPMachineScheduler(C);
12931303

1304+
if (SchedStrategy == "iterative-ilp")
1305+
return createIterativeILPMachineScheduler(C);
1306+
1307+
if (SchedStrategy == "iterative-minreg")
1308+
return createMinRegScheduler(C);
1309+
1310+
if (SchedStrategy == "iterative-maxocc")
1311+
return createIterativeGCNMaxOccupancyMachineScheduler(C);
1312+
12941313
return createGCNMaxOccupancyMachineScheduler(C);
12951314
}
12961315

llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp

Lines changed: 41 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
//===----------------------------------------------------------------------===//
1313

1414
#include "GCNIterativeScheduler.h"
15+
#include "AMDGPUIGroupLP.h"
1516
#include "GCNSchedStrategy.h"
1617
#include "SIMachineFunctionInfo.h"
1718

@@ -118,21 +119,42 @@ void GCNIterativeScheduler::printSchedRP(raw_ostream &OS,
118119
}
119120
#endif
120121

122+
// Scan region R (from R.Begin up to, but not including, R.End) for an
// instruction whose opcode satisfies SIInstrInfo::isIGLPMutationOnly. If one
// is found, the current contents of Mutations are moved into SavedMutations
// (whatever SavedMutations held before is discarded) and addMutation is called
// with a freshly created IGroupLP mutation. That mutation is created for
// SchedulingPhase::PreRAReentry when IsReentry is true and for
// SchedulingPhase::Initial otherwise. Regions without such instructions leave
// both Mutations and SavedMutations untouched.
// NOTE(review): nothing in this function swaps SavedMutations back; confirm
// that the original mutations are restored elsewhere (or are intentionally
// dropped) before a later region without IGLP instructions is scheduled.
void GCNIterativeScheduler::swapIGLPMutations(const Region &R, bool IsReentry) {
123+
bool HasIGLPInstrs = false;
124+
const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(TII);
125+
for (MachineBasicBlock::iterator I = R.Begin; I != R.End; I++) {
126+
if (SII->isIGLPMutationOnly(I->getOpcode())) {
127+
HasIGLPInstrs = true;
128+
break;
129+
}
130+
}
131+
132+
if (HasIGLPInstrs) {
133+
SavedMutations.clear();
134+
SavedMutations.swap(Mutations);
135+
auto SchedPhase = IsReentry ? AMDGPU::SchedulingPhase::PreRAReentry
136+
: AMDGPU::SchedulingPhase::Initial;
137+
138+
addMutation(createIGroupLPDAGMutation(SchedPhase));
139+
}
140+
}
141+
121142
// DAG builder helper
122143
class GCNIterativeScheduler::BuildDAG {
123144
GCNIterativeScheduler &Sch;
124145
SmallVector<SUnit *, 8> TopRoots;
125146

126147
SmallVector<SUnit*, 8> BotRoots;
127148
public:
128-
BuildDAG(const Region &R, GCNIterativeScheduler &_Sch)
129-
: Sch(_Sch) {
130-
auto BB = R.Begin->getParent();
149+
BuildDAG(const Region &R, GCNIterativeScheduler &_Sch, bool IsReentry = false)
150+
: Sch(_Sch) {
151+
auto *BB = R.Begin->getParent();
131152
Sch.BaseClass::startBlock(BB);
132153
Sch.BaseClass::enterRegion(BB, R.Begin, R.End, R.NumRegionInstrs);
133-
154+
Sch.swapIGLPMutations(R, IsReentry);
134155
Sch.buildSchedGraph(Sch.AA, nullptr, nullptr, nullptr,
135156
/*TrackLaneMask*/true);
157+
Sch.postProcessDAG();
136158
Sch.Topo.InitDAGTopologicalSorting();
137159
Sch.findRootsAndBiasEdges(TopRoots, BotRoots);
138160
}
@@ -432,13 +454,15 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
432454

433455
auto NewOcc = TargetOcc;
434456
for (auto *R : Regions) {
457+
// Always build the DAG to add mutations
458+
BuildDAG DAG(*R, *this);
459+
435460
if (R->MaxPressure.getOccupancy(ST) >= NewOcc)
436-
break;
461+
continue;
437462

438463
LLVM_DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3);
439464
printLivenessInfo(dbgs(), R->Begin, R->End, LIS));
440465

441-
BuildDAG DAG(*R, *this);
442466
const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this);
443467
const auto MaxRP = getSchedulePressure(*R, MinSchedule);
444468
LLVM_DEBUG(dbgs() << "Occupancy improvement attempt:\n";
@@ -469,8 +493,11 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
469493
sortRegionsByPressure(TgtOcc);
470494
auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
471495

472-
if (TryMaximizeOccupancy && Occ < TgtOcc)
496+
bool IsReentry = false;
497+
if (TryMaximizeOccupancy && Occ < TgtOcc) {
473498
Occ = tryMaximizeOccupancy(TgtOcc);
499+
IsReentry = true;
500+
}
474501

475502
// This is really weird but for some magic scheduling regions twice
476503
// gives performance improvement
@@ -489,7 +516,8 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
489516
LStrgy.setTargetOccupancy(I == 0 ? 0 : TgtOcc);
490517
for (auto *R : Regions) {
491518
OverrideLegacyStrategy Ovr(*R, LStrgy, *this);
492-
519+
IsReentry |= I > 0;
520+
swapIGLPMutations(*R, IsReentry);
493521
Ovr.schedule();
494522
const auto RP = getRegionPressure(*R);
495523
LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
@@ -556,8 +584,11 @@ void GCNIterativeScheduler::scheduleILP(
556584
sortRegionsByPressure(TgtOcc);
557585
auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
558586

559-
if (TryMaximizeOccupancy && Occ < TgtOcc)
587+
bool IsReentry = false;
588+
if (TryMaximizeOccupancy && Occ < TgtOcc) {
560589
Occ = tryMaximizeOccupancy(TgtOcc);
590+
IsReentry = true;
591+
}
561592

562593
TgtOcc = std::min(Occ, TgtOcc);
563594
LLVM_DEBUG(dbgs() << "Scheduling using default scheduler, "
@@ -566,7 +597,7 @@ void GCNIterativeScheduler::scheduleILP(
566597

567598
unsigned FinalOccupancy = std::min(Occ, MFI->getOccupancy());
568599
for (auto *R : Regions) {
569-
BuildDAG DAG(*R, *this);
600+
BuildDAG DAG(*R, *this, IsReentry);
570601
const auto ILPSchedule = makeGCNILPScheduler(DAG.getBottomRoots(), *this);
571602

572603
const auto RP = getSchedulePressure(*R, ILPSchedule);

llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@ class GCNIterativeScheduler : public ScheduleDAGMILive {
7777
const StrategyKind Strategy;
7878
mutable GCNUpwardRPTracker UPTracker;
7979

80+
std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations;
81+
8082
class BuildDAG;
8183
class OverrideLegacyStrategy;
8284

@@ -91,6 +93,7 @@ class GCNIterativeScheduler : public ScheduleDAGMILive {
9193
return getRegionPressure(R.Begin, R.End);
9294
}
9395

96+
void swapIGLPMutations(const Region &R, bool IsReentry);
9497
void setBestSchedule(Region &R,
9598
ScheduleRef Schedule,
9699
const GCNRegPressure &MaxRP = GCNRegPressure());

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -188,12 +188,6 @@ static void getRegisterPressures(
188188
Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
189189
}
190190

191-
// Return true if the instruction is mutually exclusive with all non-IGLP DAG
192-
// mutations, requiring all other mutations to be disabled.
193-
static bool isIGLPMutationOnly(unsigned Opcode) {
194-
return Opcode == AMDGPU::SCHED_GROUP_BARRIER || Opcode == AMDGPU::IGLP_OPT;
195-
}
196-
197191
void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
198192
bool AtTop,
199193
const RegPressureTracker &RPTracker,
@@ -1018,9 +1012,10 @@ bool GCNSchedStage::initGCNRegion() {
10181012
Unsched.reserve(DAG.NumRegionInstrs);
10191013
if (StageID == GCNSchedStageID::OccInitialSchedule ||
10201014
StageID == GCNSchedStageID::ILPInitialSchedule) {
1015+
const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG.TII);
10211016
for (auto &I : DAG) {
10221017
Unsched.push_back(&I);
1023-
if (isIGLPMutationOnly(I.getOpcode()))
1018+
if (SII->isIGLPMutationOnly(I.getOpcode()))
10241019
DAG.RegionsWithIGLPInstrs[RegionIdx] = true;
10251020
}
10261021
} else {
@@ -1754,8 +1749,9 @@ void GCNScheduleDAGMILive::updateRegionBoundaries(
17541749
}
17551750

17561751
// Return true if any instruction visited when iterating *DAG has an opcode for
// which isIGLPMutationOnly holds. The added ('+') lines obtain an SIInstrInfo
// by casting DAG->TII and query it, replacing the call to the file-local
// helper that this change removes.
static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
1757-
return any_of(*DAG, [](MachineBasicBlock::iterator MI) {
1758-
return isIGLPMutationOnly(MI->getOpcode());
1752+
const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII);
1753+
return any_of(*DAG, [SII](MachineBasicBlock::iterator MI) {
1754+
return SII->isIGLPMutationOnly(MI->getOpcode());
17591755
});
17601756
}
17611757

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -986,6 +986,12 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
986986

987987
bool isIGLP(const MachineInstr &MI) const { return isIGLP(MI.getOpcode()); }
988988

989+
// Return true if the instruction is mutually exclusive with all non-IGLP DAG
990+
// mutations, requiring all other mutations to be disabled.
// The opcodes treated this way are SCHED_GROUP_BARRIER and IGLP_OPT.
991+
bool isIGLPMutationOnly(unsigned Opcode) const {
992+
return Opcode == AMDGPU::SCHED_GROUP_BARRIER || Opcode == AMDGPU::IGLP_OPT;
993+
}
994+
989995
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode) {
990996
switch (Opcode) {
991997
case AMDGPU::S_WAITCNT_soft:

llvm/test/CodeGen/AMDGPU/iglp.opt.reentry.ll

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -O3 < %s | FileCheck %s
2+
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -O3 -misched=gcn-iterative-max-occupancy-experimental < %s | FileCheck %s
3+
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -O3 -misched=gcn-iterative-ilp < %s | FileCheck %s
24

35
; Test should not result in build failure
46
; CHECK-LABEL: shouldNotReApply

0 commit comments

Comments
 (0)