Skip to content

Commit 0b21b25

Browse files
authored
[AMDGPU] Do not optimize away pre-existing waitcnt instructions at -O0 (#90716)
The autogenerated memory legalizer tests use -O0, so this change allows us to see the exact waitcnts that were inserted by the memory legalizer without them being optimized away.
1 parent fdf206c commit 0b21b25

28 files changed

+168134
-77300
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 27 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -448,12 +448,19 @@ class WaitcntGenerator {
448448
const SIInstrInfo *TII = nullptr;
449449
AMDGPU::IsaVersion IV;
450450
InstCounterType MaxCounter;
451+
bool OptNone;
451452

452453
public:
453454
WaitcntGenerator() {}
454-
WaitcntGenerator(const GCNSubtarget *ST, InstCounterType MaxCounter)
455-
: ST(ST), TII(ST->getInstrInfo()),
456-
IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter) {}
455+
WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter)
456+
: ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
457+
IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter),
458+
OptNone(MF.getFunction().hasOptNone() ||
459+
MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {}
460+
461+
// Return true if the current function should be compiled with no
462+
// optimization.
463+
bool isOptNone() const { return OptNone; }
457464

458465
// Edits an existing sequence of wait count instructions according
459466
// to an incoming Waitcnt value, which is itself updated to reflect
@@ -504,8 +511,8 @@ class WaitcntGenerator {
504511
class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
505512
public:
506513
WaitcntGeneratorPreGFX12() {}
507-
WaitcntGeneratorPreGFX12(const GCNSubtarget *ST)
508-
: WaitcntGenerator(ST, NUM_NORMAL_INST_CNTS) {}
514+
WaitcntGeneratorPreGFX12(const MachineFunction &MF)
515+
: WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS) {}
509516

510517
bool
511518
applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
@@ -539,8 +546,9 @@ class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
539546
class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
540547
public:
541548
WaitcntGeneratorGFX12Plus() {}
542-
WaitcntGeneratorGFX12Plus(const GCNSubtarget *ST, InstCounterType MaxCounter)
543-
: WaitcntGenerator(ST, MaxCounter) {}
549+
WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
550+
InstCounterType MaxCounter)
551+
: WaitcntGenerator(MF, MaxCounter) {}
544552

545553
bool
546554
applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
@@ -597,8 +605,6 @@ class SIInsertWaitcnts : public MachineFunctionPass {
597605
bool ForceEmitZeroWaitcnts;
598606
bool ForceEmitWaitcnt[NUM_INST_CNTS];
599607

600-
bool OptNone;
601-
602608
// In any given run of this pass, WCG will point to one of these two
603609
// generator objects, which must have been re-initialised before use
604610
// from a value made using a subtarget constructor.
@@ -1203,19 +1209,19 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
12031209
continue;
12041210

12051211
unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1206-
bool IsSoft = Opcode != II.getOpcode();
1212+
bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
12071213

12081214
// Update required wait count. If this is a soft waitcnt (= it was added
12091215
// by an earlier pass), it may be entirely removed.
12101216
if (Opcode == AMDGPU::S_WAITCNT) {
12111217
unsigned IEnc = II.getOperand(0).getImm();
12121218
AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
1213-
if (IsSoft)
1219+
if (TrySimplify)
12141220
ScoreBrackets.simplifyWaitcnt(OldWait);
12151221
Wait = Wait.combined(OldWait);
12161222

12171223
// Merge consecutive waitcnt of the same type by erasing multiples.
1218-
if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && IsSoft)) {
1224+
if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
12191225
II.eraseFromParent();
12201226
Modified = true;
12211227
} else
@@ -1226,11 +1232,11 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
12261232

12271233
unsigned OldVSCnt =
12281234
TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1229-
if (IsSoft)
1235+
if (TrySimplify)
12301236
ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
12311237
Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);
12321238

1233-
if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && IsSoft)) {
1239+
if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
12341240
II.eraseFromParent();
12351241
Modified = true;
12361242
} else
@@ -1356,21 +1362,21 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
13561362
// by an earlier pass), it may be entirely removed.
13571363

13581364
unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1359-
bool IsSoft = Opcode != II.getOpcode();
1365+
bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
13601366

13611367
if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
13621368
unsigned OldEnc =
13631369
TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
13641370
AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
1365-
if (IsSoft)
1371+
if (TrySimplify)
13661372
ScoreBrackets.simplifyWaitcnt(OldWait);
13671373
Wait = Wait.combined(OldWait);
13681374
UpdatableInstr = &CombinedLoadDsCntInstr;
13691375
} else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
13701376
unsigned OldEnc =
13711377
TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
13721378
AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
1373-
if (IsSoft)
1379+
if (TrySimplify)
13741380
ScoreBrackets.simplifyWaitcnt(OldWait);
13751381
Wait = Wait.combined(OldWait);
13761382
UpdatableInstr = &CombinedStoreDsCntInstr;
@@ -1379,7 +1385,7 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
13791385
assert(CT.has_value());
13801386
unsigned OldCnt =
13811387
TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1382-
if (IsSoft)
1388+
if (TrySimplify)
13831389
ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);
13841390
addWait(Wait, CT.value(), OldCnt);
13851391
UpdatableInstr = &WaitInstrs[CT.value()];
@@ -1649,7 +1655,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
16491655
// * we are not in Dynamic VGPR mode
16501656
else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
16511657
MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
1652-
if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !OptNone &&
1658+
if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !WCG->isOptNone() &&
16531659
ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
16541660
!ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))
16551661
ReleaseVGPRInsts.insert(&MI);
@@ -2471,11 +2477,11 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
24712477

24722478
if (ST->hasExtendedWaitCounts()) {
24732479
MaxCounter = NUM_EXTENDED_INST_CNTS;
2474-
WCGGFX12Plus = WaitcntGeneratorGFX12Plus(ST, MaxCounter);
2480+
WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter);
24752481
WCG = &WCGGFX12Plus;
24762482
} else {
24772483
MaxCounter = NUM_NORMAL_INST_CNTS;
2478-
WCGPreGFX12 = WaitcntGeneratorPreGFX12(ST);
2484+
WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF);
24792485
WCG = &WCGPreGFX12;
24802486
}
24812487

@@ -2487,9 +2493,6 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
24872493

24882494
SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
24892495

2490-
OptNone = MF.getFunction().hasOptNone() ||
2491-
MF.getTarget().getOptLevel() == CodeGenOptLevel::None;
2492-
24932496
HardwareLimits Limits = {};
24942497
if (ST->hasExtendedWaitCounts()) {
24952498
Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV);

0 commit comments

Comments
 (0)