Skip to content

[AMDGPU][SIInsertWaitcnts] Do not add s_waitcnt when the counters are known to be 0 already #65735

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,12 @@ void AMDGPUInstrPostProcess::postProcessInstruction(
std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
switch (MCI.getOpcode()) {
case AMDGPU::S_WAITCNT:
case AMDGPU::S_WAITCNT_soft:
case AMDGPU::S_WAITCNT_EXPCNT:
case AMDGPU::S_WAITCNT_LGKMCNT:
case AMDGPU::S_WAITCNT_VMCNT:
case AMDGPU::S_WAITCNT_VSCNT:
case AMDGPU::S_WAITCNT_VSCNT_soft:
case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
case AMDGPU::S_WAITCNT_VMCNT_gfx10:
Expand Down Expand Up @@ -77,10 +79,12 @@ unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef<InstRef> IssuedInst,
default:
return 0;
case AMDGPU::S_WAITCNT: // This instruction
case AMDGPU::S_WAITCNT_soft:
case AMDGPU::S_WAITCNT_EXPCNT:
case AMDGPU::S_WAITCNT_LGKMCNT:
case AMDGPU::S_WAITCNT_VMCNT:
case AMDGPU::S_WAITCNT_VSCNT: // to this instruction are all pseudo.
case AMDGPU::S_WAITCNT_VSCNT:
case AMDGPU::S_WAITCNT_VSCNT_soft: // to this instruction are all pseudo.
case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
case AMDGPU::S_WAITCNT_VMCNT_gfx10:
Expand Down
136 changes: 75 additions & 61 deletions llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,13 @@ class WaitcntBrackets {
VgprVmemTypes[GprNo] = 0;
}

void setNonKernelFunctionInitialState() {
for (InstCounterType Counter : inst_counter_types()) {
setScoreUB(Counter, getWaitCountMax(Counter));
PendingEvents |= WaitEventMaskForInst[Counter];
}
}

void print(raw_ostream &);
void dump() { print(dbgs()); }

Expand Down Expand Up @@ -364,7 +371,6 @@ class SIInsertWaitcnts : public MachineFunctionPass {
const MachineRegisterInfo *MRI = nullptr;
AMDGPU::IsaVersion IV;

DenseSet<MachineInstr *> TrackedWaitcntSet;
Copy link
Contributor

@jayfoad jayfoad Sep 19, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is my understanding correct? Previously we used TrackedWaitcntSet to distinguish pre-existing waitcnts from waitcnts inserted by this pass. Now we don't need that, all we need is to know the difference between hard and soft waitcnts?

DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
MachineLoopInfo *MLI;
Expand Down Expand Up @@ -477,7 +483,7 @@ class SIInsertWaitcnts : public MachineFunctionPass {
bool generateWaitcnt(AMDGPU::Waitcnt Wait,
MachineBasicBlock::instr_iterator It,
MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
MachineInstr *OldWaitcntInstr);
MachineInstr *OldWaitcntInstr) const;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like an unrelated clean up? Please commit it separately.

void updateEventWaitcntAfter(MachineInstr &Inst,
WaitcntBrackets *ScoreBrackets);
bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
Expand All @@ -486,6 +492,7 @@ class SIInsertWaitcnts : public MachineFunctionPass {
MachineInstr &OldWaitcntInstr,
AMDGPU::Waitcnt &Wait,
MachineBasicBlock::instr_iterator It) const;
bool updateWaitcntIfSoft(MachineInstr *Waitcnt) const;
};

} // end anonymous namespace
Expand Down Expand Up @@ -870,6 +877,15 @@ static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName,
return true;
}

bool SIInsertWaitcnts::updateWaitcntIfSoft(MachineInstr *Waitcnt) const {
unsigned Opcode = Waitcnt->getOpcode();
if (!SIInstrInfo::isSoftWaitcnt(Opcode))
return false;

Waitcnt->setDesc(TII->get(SIInstrInfo::getNonSoftWaitcntOpcode(Opcode)));
return true;
}

/// Combine consecutive waitcnt instructions that precede \p It and follow
/// \p OldWaitcntInstr and apply any extra wait from waitcnt that were added
/// by previous passes. Currently this pass conservatively assumes that these
Expand All @@ -886,34 +902,40 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(
if (II.isMetaInstruction())
continue;

if (II.getOpcode() == AMDGPU::S_WAITCNT) {
unsigned Opcode = II.getOpcode();
bool CanFullyDiscardWaitcntSequence = SIInstrInfo::isSoftWaitcnt(Opcode);

if (SIInstrInfo::isWaitcnt(Opcode)) {
// Conservatively update required wait if this waitcnt was added in an
// earlier pass. In this case it will not exist in the tracked waitcnt
// set.
if (!TrackedWaitcntSet.count(&II)) {
unsigned IEnc = II.getOperand(0).getImm();
AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
Wait = Wait.combined(OldWait);
}
unsigned IEnc = II.getOperand(0).getImm();
AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
if (CanFullyDiscardWaitcntSequence)
ScoreBrackets.simplifyWaitcnt(OldWait);
Wait = Wait.combined(OldWait);

// Merge consecutive waitcnt of the same type by erasing multiples.
if (!WaitcntInstr) {
if (!WaitcntInstr &&
(Wait.hasWaitExceptVsCnt() || !CanFullyDiscardWaitcntSequence)) {
WaitcntInstr = &II;
} else {
II.eraseFromParent();
Modified = true;
}

} else {
assert(II.getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
assert(SIInstrInfo::isWaitcntVsCnt(Opcode));
assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
if (!TrackedWaitcntSet.count(&II)) {
unsigned OldVSCnt =
TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt);
}

if (!WaitcntVsCntInstr) {
unsigned OldVSCnt =
TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
if (CanFullyDiscardWaitcntSequence)
ScoreBrackets.simplifyWaitcnt(InstCounterType::VS_CNT, OldVSCnt);
Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt);

if (!WaitcntVsCntInstr &&
(Wait.hasWaitVsCnt() || !CanFullyDiscardWaitcntSequence)) {
WaitcntVsCntInstr = &II;
} else {
II.eraseFromParent();
Expand All @@ -924,48 +946,38 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(

// Updated encoding of merged waitcnt with the required wait.
if (WaitcntInstr) {
if (Wait.hasWaitExceptVsCnt()) {
Modified |=
updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
AMDGPU::encodeWaitcnt(IV, Wait));
ScoreBrackets.applyWaitcnt(Wait);
Wait.VmCnt = ~0u;
Wait.LgkmCnt = ~0u;
Wait.ExpCnt = ~0u;

LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
? dbgs() << "applyPreexistingWaitcnt\n"
<< "New Instr at block end: " << *WaitcntInstr
<< '\n'
: dbgs() << "applyPreexistingWaitcnt\n"
<< "Old Instr: " << *It
<< "New Instr: " << *WaitcntInstr << '\n');
Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
AMDGPU::encodeWaitcnt(IV, Wait));
Modified |= updateWaitcntIfSoft(WaitcntInstr);

} else {
WaitcntInstr->eraseFromParent();
Modified = true;
}
ScoreBrackets.applyWaitcnt(Wait);
Wait.VmCnt = ~0u;
Wait.LgkmCnt = ~0u;
Wait.ExpCnt = ~0u;

LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
? dbgs()
<< "applyPreexistingWaitcnt\n"
<< "New Instr at block end: " << *WaitcntInstr << '\n'
: dbgs() << "applyPreexistingWaitcnt\n"
<< "Old Instr: " << *It
<< "New Instr: " << *WaitcntInstr << '\n');
}

if (WaitcntVsCntInstr) {
if (Wait.hasWaitVsCnt()) {
assert(ST->hasVscnt());
Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
AMDGPU::OpName::simm16, Wait.VsCnt);
ScoreBrackets.applyWaitcnt(Wait);
Wait.VsCnt = ~0u;

LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
? dbgs() << "applyPreexistingWaitcnt\n"
<< "New Instr at block end: "
<< *WaitcntVsCntInstr << '\n'
: dbgs() << "applyPreexistingWaitcnt\n"
<< "Old Instr: " << *It
<< "New Instr: " << *WaitcntVsCntInstr << '\n');
} else {
WaitcntVsCntInstr->eraseFromParent();
Modified = true;
}
Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
AMDGPU::OpName::simm16, Wait.VsCnt);
Modified |= updateWaitcntIfSoft(WaitcntVsCntInstr);
ScoreBrackets.applyWaitcnt(Wait);
Wait.VsCnt = ~0u;

LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
? dbgs() << "applyPreexistingWaitcnt\n"
<< "New Instr at block end: " << *WaitcntVsCntInstr
<< '\n'
: dbgs() << "applyPreexistingWaitcnt\n"
<< "Old Instr: " << *It
<< "New Instr: " << *WaitcntVsCntInstr << '\n');
}

return Modified;
Expand Down Expand Up @@ -1284,7 +1296,7 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
MachineBasicBlock::instr_iterator It,
MachineBasicBlock &Block,
WaitcntBrackets &ScoreBrackets,
MachineInstr *OldWaitcntInstr) {
MachineInstr *OldWaitcntInstr) const {
bool Modified = false;
const DebugLoc &DL = Block.findDebugLoc(It);

Expand Down Expand Up @@ -1317,7 +1329,6 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
auto SWaitInst =
BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
TrackedWaitcntSet.insert(SWaitInst);
Modified = true;

LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
Expand All @@ -1331,7 +1342,6 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
auto SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
.addImm(Wait.VsCnt);
TrackedWaitcntSet.insert(SWaitInst);
Modified = true;

LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
Expand Down Expand Up @@ -1574,9 +1584,9 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
}

static bool isWaitInstr(MachineInstr &Inst) {
return Inst.getOpcode() == AMDGPU::S_WAITCNT ||
(Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
Inst.getOperand(0).isReg() &&
auto Opcode = Inst.getOpcode();
return SIInstrInfo::isWaitcnt(Opcode) ||
(SIInstrInfo::isWaitcntVsCnt(Opcode) && Inst.getOperand(0).isReg() &&
Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
}

Expand Down Expand Up @@ -1845,7 +1855,6 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
TRI->getEncodingValue(AMDGPU::SGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;

TrackedWaitcntSet.clear();
BlockInfos.clear();
bool Modified = false;

Expand All @@ -1863,6 +1872,11 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
;
BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);

auto NonKernelInitialState =
std::make_unique<WaitcntBrackets>(ST, Limits, Encoding);
NonKernelInitialState->setNonKernelFunctionInitialState();
BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);

Modified = true;
}

Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8593,6 +8593,11 @@ bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
}

int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {

// FIXME: move to the right place
if (SIInstrInfo::isSoftWaitcnt(Opcode))
Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);

unsigned Gen = subtargetEncodingFamily(ST);

if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
Expand Down
25 changes: 25 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -845,6 +845,31 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
return get(Opcode).TSFlags & SIInstrFlags::TiedSourceNotRead;
}

static unsigned getNonSoftWaitcntOpcode(unsigned Opcode) {
if (isWaitcnt(Opcode))
return AMDGPU::S_WAITCNT;

if (isWaitcntVsCnt(Opcode))
return AMDGPU::S_WAITCNT_VSCNT;

llvm_unreachable("Expected opcode S_WAITCNT/S_WAITCNT_VSCNT");
}

static bool isWaitcnt(unsigned Opcode) {
return Opcode == AMDGPU::S_WAITCNT || Opcode == AMDGPU::S_WAITCNT_soft;
}

static bool isWaitcntVsCnt(unsigned Opcode) {
return Opcode == AMDGPU::S_WAITCNT_VSCNT ||
Opcode == AMDGPU::S_WAITCNT_VSCNT_soft;
}

// soft waitcnt instructions can be relaxed/optimized out by SIInsertWaitcnts
static bool isSoftWaitcnt(unsigned Opcode) {
return Opcode == AMDGPU::S_WAITCNT_soft ||
Opcode == AMDGPU::S_WAITCNT_VSCNT_soft;
}

bool isVGPRCopy(const MachineInstr &MI) const {
assert(isCopyInstr(MI));
Register Dest = MI.getOperand(0).getReg();
Expand Down
24 changes: 19 additions & 5 deletions llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ static cl::opt<bool> AmdgcnSkipCacheInvalidations(
"amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
cl::desc("Use this to skip inserting cache invalidating instructions."));

static cl::opt<bool> AmdgcnDisableSoftWaitcnt(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of this I suggest you change memory-legalizer-atomic-fence.ll to use -stop-after=si-memory-legalizer (and generate the checks in it with utils/update_mir_test_checks.py).

"amdgcn-disable-soft-waitcnt", cl::init(false), cl::Hidden,
cl::desc("Use this option to disable 'soft' waitcnt instructions in the "
"memory-legalizer."));

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
Expand Down Expand Up @@ -271,6 +276,10 @@ class SICacheControl {
/// Whether to insert cache invalidating instructions.
bool InsertCacheInv;

/// Either regular or soft waitcnt opcode.
unsigned WAITCNT_Opcode;
unsigned WAITCNT_VSCNT_Opcode;

SICacheControl(const GCNSubtarget &ST);

/// Sets named bit \p BitName to "true" if present in instruction \p MI.
Expand Down Expand Up @@ -832,6 +841,11 @@ SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
TII = ST.getInstrInfo();
IV = getIsaVersion(ST.getCPU());
InsertCacheInv = !AmdgcnSkipCacheInvalidations;
WAITCNT_Opcode =
AmdgcnDisableSoftWaitcnt ? AMDGPU::S_WAITCNT : AMDGPU::S_WAITCNT_soft;
WAITCNT_VSCNT_Opcode = AmdgcnDisableSoftWaitcnt
? AMDGPU::S_WAITCNT_VSCNT
: AMDGPU::S_WAITCNT_VSCNT_soft;
}

bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
Expand Down Expand Up @@ -1055,7 +1069,7 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
VMCnt ? 0 : getVmcntBitMask(IV),
getExpcntBitMask(IV),
LGKMCnt ? 0 : getLgkmcntBitMask(IV));
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
BuildMI(MBB, MI, DL, TII->get(WAITCNT_Opcode)).addImm(WaitCntImmediate);
Changed = true;
}

Expand Down Expand Up @@ -1963,14 +1977,14 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
VMCnt ? 0 : getVmcntBitMask(IV),
getExpcntBitMask(IV),
LGKMCnt ? 0 : getLgkmcntBitMask(IV));
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
BuildMI(MBB, MI, DL, TII->get(WAITCNT_Opcode)).addImm(WaitCntImmediate);
Changed = true;
}

if (VSCnt) {
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
.addImm(0);
BuildMI(MBB, MI, DL, TII->get(WAITCNT_VSCNT_Opcode))
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
.addImm(0);
Changed = true;
}

Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/SOPInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1459,6 +1459,16 @@ def S_WAKEUP : SOPP_Pseudo <"s_wakeup", (ins) > {

def S_WAITCNT : SOPP_Pseudo <"s_waitcnt" , (ins SWaitCnt:$simm16), "$simm16",
[(int_amdgcn_s_waitcnt timm:$simm16)]>;

// "_soft" waitcnts are waitcnts whose wait can be relaxed or completely removed.
// These are inserted by to resolve memory dependencies by the memory legalizer and later optimized by SIInsertWaitcnts
// For example, a S_WAITCNT_soft 0 can be completely removed on a function that doesn't access memory.
def S_WAITCNT_soft : SOPP_Pseudo <"s_soft_waitcnt" , (ins SWaitCnt:$simm16), "$simm16">;
def S_WAITCNT_VSCNT_soft : SOPP_Pseudo<"s_soft_waitcnt_vscnt", (ins SReg_32:$sdst, s16imm:$simm16), "$sdst, $simm16"> {
let mayLoad = 1;
let mayStore = 1;
let has_sdst = 1;
Comment on lines +1468 to +1470
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are these needed? The definition of S_WAITCNT above does not have them.

}
def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
[(int_amdgcn_s_sethalt timm:$simm16)]>;
def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">;
Expand Down
Loading