Skip to content

Commit d890dda

Browse files
authored
[NFCI][AMDGPU] Try to use PressureDiff to Calculate RegPressure. (#94221)
PressureDiff is reliable most of the time, and it's pretty much free compared to RPTracker. We can use it whenever there are no subregister definitions or physregs involved. No subregs because PDiff doesn't take lane liveness into account, and no physregs because it seems to get physreg liveness completely wrong: sometimes it adds a diff, sometimes it doesn't. I didn't look at that one for long, so maybe there is something we can eventually do to make it better. This allows us to save a ton of calls to RPTracker and LIS too. On a huge IR module (100+MB), it went from about 20M calls to RPTracker in this function down to 3.4M, with the rest being PressureDiffs. I also added an expensive check to verify correctness of PressureDiff.
1 parent f2d215f commit d890dda

File tree

2 files changed

+113
-26
lines changed

2 files changed

+113
-26
lines changed

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Lines changed: 108 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -116,31 +116,112 @@ void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
116116
<< ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
117117
}
118118

119+
/// Checks whether \p SU can use the cached DAG pressure diffs to compute the
120+
/// current register pressure.
121+
///
122+
/// This works for the common case, but it has a few exceptions that have been
123+
/// observed through trial and error:
124+
/// - Explicit physical register operands
125+
/// - Subregister definitions
126+
///
127+
/// In both of those cases, PressureDiff doesn't represent the actual pressure,
128+
/// and querying LiveIntervals through the RegPressureTracker is needed to get
129+
/// an accurate value.
130+
///
131+
/// We should eventually only use PressureDiff for maximum performance, but this
132+
/// already allows 80% of SUs to take the fast path without changing scheduling
133+
/// at all. Further changes would either change scheduling, or require a lot
134+
/// more logic to recover an accurate pressure estimate from the PressureDiffs.
135+
static bool canUsePressureDiffs(const SUnit &SU) {
136+
if (!SU.isInstr())
137+
return false;
138+
139+
// Cannot use pressure diffs for subregister defs or with physregs, it's
140+
// imprecise in both cases.
141+
for (const auto &Op : SU.getInstr()->operands()) {
142+
if (!Op.isReg() || Op.isImplicit())
143+
continue;
144+
if (Op.getReg().isPhysical() ||
145+
(Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister))
146+
return false;
147+
}
148+
return true;
149+
}
150+
151+
static void getRegisterPressures(bool AtTop,
152+
const RegPressureTracker &RPTracker, SUnit *SU,
153+
std::vector<unsigned> &Pressure,
154+
std::vector<unsigned> &MaxPressure) {
155+
// getDownwardPressure() and getUpwardPressure() make temporary changes to
156+
// the tracker, so we need to pass those function a non-const copy.
157+
RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
158+
if (AtTop)
159+
TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
160+
else
161+
TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
162+
}
163+
119164
void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
120165
bool AtTop,
121166
const RegPressureTracker &RPTracker,
122167
const SIRegisterInfo *SRI,
123168
unsigned SGPRPressure,
124-
unsigned VGPRPressure) {
169+
unsigned VGPRPressure, bool IsBottomUp) {
125170
Cand.SU = SU;
126171
Cand.AtTop = AtTop;
127172

128173
if (!DAG->isTrackingPressure())
129174
return;
130175

131-
// getDownwardPressure() and getUpwardPressure() make temporary changes to
132-
// the tracker, so we need to pass those function a non-const copy.
133-
RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
134-
135176
Pressure.clear();
136177
MaxPressure.clear();
137178

138-
if (AtTop)
139-
TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
140-
else {
141-
// FIXME: I think for bottom up scheduling, the register pressure is cached
142-
// and can be retrieved by DAG->getPressureDif(SU).
143-
TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
179+
// We try to use the cached PressureDiffs in the ScheduleDAG whenever
180+
// possible over querying the RegPressureTracker.
181+
//
182+
// RegPressureTracker will make a lot of LIS queries which are very
183+
// expensive, it is considered a slow function in this context.
184+
//
185+
// PressureDiffs are precomputed and cached, and getPressureDiff is just a
186+
// trivial lookup into an array. It is pretty much free.
187+
//
188+
// In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
189+
// PressureDiffs.
190+
if (AtTop || !canUsePressureDiffs(*SU)) {
191+
getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure);
192+
} else {
193+
// Reserve 4 slots.
194+
Pressure.resize(4, 0);
195+
Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure;
196+
Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure;
197+
198+
for (const auto &Diff : DAG->getPressureDiff(SU)) {
199+
if (!Diff.isValid())
200+
continue;
201+
// PressureDiffs is always bottom-up so if we're working top-down we need
202+
// to invert its sign.
203+
Pressure[Diff.getPSet()] +=
204+
(IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc());
205+
}
206+
207+
#ifdef EXPENSIVE_CHECKS
208+
std::vector<unsigned> CheckPressure, CheckMaxPressure;
209+
getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure);
210+
if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
211+
CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
212+
Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
213+
CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) {
214+
errs() << "Register Pressure is inaccurate when calculated through "
215+
"PressureDiff\n"
216+
<< "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32]
217+
<< ", expected "
218+
<< CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n"
219+
<< "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32]
220+
<< ", expected "
221+
<< CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n";
222+
report_fatal_error("inaccurate register pressure calculation");
223+
}
224+
#endif
144225
}
145226

146227
unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
@@ -158,7 +239,6 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
158239
bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
159240
bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
160241

161-
162242
// FIXME: We have to enter REG-EXCESS before we reach the actual threshold
163243
// to increase the likelihood we don't go over the limits. We should improve
164244
// the analysis to look through dependencies to find the path with the least
@@ -207,7 +287,8 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
207287
void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
208288
const CandPolicy &ZonePolicy,
209289
const RegPressureTracker &RPTracker,
210-
SchedCandidate &Cand) {
290+
SchedCandidate &Cand,
291+
bool IsBottomUp) {
211292
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
212293
ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
213294
unsigned SGPRPressure = 0;
@@ -220,8 +301,8 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
220301
for (SUnit *SU : Q) {
221302

222303
SchedCandidate TryCand(ZonePolicy);
223-
initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI,
224-
SGPRPressure, VGPRPressure);
304+
initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
305+
VGPRPressure, IsBottomUp);
225306
// Pass SchedBoundary only when comparing nodes from the same boundary.
226307
SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
227308
tryCandidate(Cand, TryCand, ZoneArg);
@@ -262,15 +343,17 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
262343
if (!BotCand.isValid() || BotCand.SU->isScheduled ||
263344
BotCand.Policy != BotPolicy) {
264345
BotCand.reset(CandPolicy());
265-
pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand);
346+
pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand,
347+
/*IsBottomUp=*/true);
266348
assert(BotCand.Reason != NoCand && "failed to find the first candidate");
267349
} else {
268350
LLVM_DEBUG(traceCandidate(BotCand));
269351
#ifndef NDEBUG
270352
if (VerifyScheduling) {
271353
SchedCandidate TCand;
272354
TCand.reset(CandPolicy());
273-
pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand);
355+
pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
356+
/*IsBottomUp=*/true);
274357
assert(TCand.SU == BotCand.SU &&
275358
"Last pick result should correspond to re-picking right now");
276359
}
@@ -282,15 +365,17 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
282365
if (!TopCand.isValid() || TopCand.SU->isScheduled ||
283366
TopCand.Policy != TopPolicy) {
284367
TopCand.reset(CandPolicy());
285-
pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand);
368+
pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand,
369+
/*IsBottomUp=*/false);
286370
assert(TopCand.Reason != NoCand && "failed to find the first candidate");
287371
} else {
288372
LLVM_DEBUG(traceCandidate(TopCand));
289373
#ifndef NDEBUG
290374
if (VerifyScheduling) {
291375
SchedCandidate TCand;
292376
TCand.reset(CandPolicy());
293-
pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand);
377+
pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
378+
/*IsBottomUp=*/false);
294379
assert(TCand.SU == TopCand.SU &&
295380
"Last pick result should correspond to re-picking right now");
296381
}
@@ -327,7 +412,8 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
327412
if (!SU) {
328413
CandPolicy NoPolicy;
329414
TopCand.reset(NoPolicy);
330-
pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand);
415+
pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
416+
/*IsBottomUp=*/false);
331417
assert(TopCand.Reason != NoCand && "failed to find a candidate");
332418
SU = TopCand.SU;
333419
}
@@ -337,7 +423,8 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
337423
if (!SU) {
338424
CandPolicy NoPolicy;
339425
BotCand.reset(NoPolicy);
340-
pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand);
426+
pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand,
427+
/*IsBottomUp=*/true);
341428
assert(BotCand.Reason != NoCand && "failed to find a candidate");
342429
SU = BotCand.SU;
343430
}

llvm/lib/Target/AMDGPU/GCNSchedStrategy.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -45,12 +45,12 @@ class GCNSchedStrategy : public GenericScheduler {
4545

4646
void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
4747
const RegPressureTracker &RPTracker,
48-
SchedCandidate &Cand);
48+
SchedCandidate &Cand, bool IsBottomUp);
4949

50-
void initCandidate(SchedCandidate &Cand, SUnit *SU,
51-
bool AtTop, const RegPressureTracker &RPTracker,
52-
const SIRegisterInfo *SRI,
53-
unsigned SGPRPressure, unsigned VGPRPressure);
50+
void initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop,
51+
const RegPressureTracker &RPTracker,
52+
const SIRegisterInfo *SRI, unsigned SGPRPressure,
53+
unsigned VGPRPressure, bool IsBottomUp);
5454

5555
std::vector<unsigned> Pressure;
5656

0 commit comments

Comments
 (0)