Skip to content

Commit c96b4d1

Browse files
committed
[AMDGPU][SIInsertWaitcnts] Do not add s_waitcnt when the counters are known to be 0 already
1 parent a8913f8 commit c96b4d1

File tree

65 files changed

+570
-10626
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

65 files changed

+570
-10626
lines changed

llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,12 @@ void AMDGPUInstrPostProcess::postProcessInstruction(
2525
std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
2626
switch (MCI.getOpcode()) {
2727
case AMDGPU::S_WAITCNT:
28+
case AMDGPU::S_WAITCNT_soft:
2829
case AMDGPU::S_WAITCNT_EXPCNT:
2930
case AMDGPU::S_WAITCNT_LGKMCNT:
3031
case AMDGPU::S_WAITCNT_VMCNT:
3132
case AMDGPU::S_WAITCNT_VSCNT:
33+
case AMDGPU::S_WAITCNT_VSCNT_soft:
3234
case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
3335
case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
3436
case AMDGPU::S_WAITCNT_VMCNT_gfx10:
@@ -77,10 +79,12 @@ unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef<InstRef> IssuedInst,
7779
default:
7880
return 0;
7981
case AMDGPU::S_WAITCNT: // This instruction
82+
case AMDGPU::S_WAITCNT_soft:
8083
case AMDGPU::S_WAITCNT_EXPCNT:
8184
case AMDGPU::S_WAITCNT_LGKMCNT:
8285
case AMDGPU::S_WAITCNT_VMCNT:
83-
case AMDGPU::S_WAITCNT_VSCNT: // to this instruction are all pseudo.
86+
case AMDGPU::S_WAITCNT_VSCNT:
87+
case AMDGPU::S_WAITCNT_VSCNT_soft: // to this instruction are all pseudo.
8488
case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
8589
case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
8690
case AMDGPU::S_WAITCNT_VMCNT_gfx10:

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 73 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -292,6 +292,13 @@ class WaitcntBrackets {
292292
VgprVmemTypes[GprNo] = 0;
293293
}
294294

295+
void setNonKernelFunctionInitialState() {
296+
for (InstCounterType Counter : inst_counter_types()) {
297+
setScoreUB(Counter, getWaitCountMax(Counter));
298+
PendingEvents |= WaitEventMaskForInst[Counter];
299+
}
300+
}
301+
295302
void print(raw_ostream &);
296303
void dump() { print(dbgs()); }
297304

@@ -364,7 +371,6 @@ class SIInsertWaitcnts : public MachineFunctionPass {
364371
const MachineRegisterInfo *MRI = nullptr;
365372
AMDGPU::IsaVersion IV;
366373

367-
DenseSet<MachineInstr *> TrackedWaitcntSet;
368374
DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
369375
DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
370376
MachineLoopInfo *MLI;
@@ -486,6 +492,7 @@ class SIInsertWaitcnts : public MachineFunctionPass {
486492
MachineInstr &OldWaitcntInstr,
487493
AMDGPU::Waitcnt &Wait,
488494
MachineBasicBlock::instr_iterator It) const;
495+
bool updateWaitcntIfSoft(MachineInstr *Waitcnt) const;
489496
};
490497

491498
} // end anonymous namespace
@@ -870,6 +877,15 @@ static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName,
870877
return true;
871878
}
872879

880+
bool SIInsertWaitcnts::updateWaitcntIfSoft(MachineInstr *Waitcnt) const {
881+
unsigned Opcode = Waitcnt->getOpcode();
882+
if (!SIInstrInfo::isSoftWaitcnt(Opcode))
883+
return false;
884+
885+
Waitcnt->setDesc(TII->get(SIInstrInfo::getNonSoftWaitcntOpcode(Opcode)));
886+
return true;
887+
}
888+
873889
/// Combine consecutive waitcnt instructions that precede \p It and follow
874890
/// \p OldWaitcntInstr and apply any extra wait from waitcnt that were added
875891
/// by previous passes. Currently this pass conservatively assumes that these
@@ -886,34 +902,40 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(
886902
if (II.isMetaInstruction())
887903
continue;
888904

889-
if (II.getOpcode() == AMDGPU::S_WAITCNT) {
905+
unsigned Opcode = II.getOpcode();
906+
bool CanFullyDiscardWaitcntSequence = SIInstrInfo::isSoftWaitcnt(Opcode);
907+
908+
if (SIInstrInfo::isWaitcnt(Opcode)) {
890909
// Update the required wait with the counts encoded in this waitcnt. A
891910
// soft waitcnt is first simplified against the score brackets, so it
892911
// may be relaxed or discarded entirely.
893-
if (!TrackedWaitcntSet.count(&II)) {
894-
unsigned IEnc = II.getOperand(0).getImm();
895-
AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
896-
Wait = Wait.combined(OldWait);
897-
}
912+
unsigned IEnc = II.getOperand(0).getImm();
913+
AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
914+
if (CanFullyDiscardWaitcntSequence)
915+
ScoreBrackets.simplifyWaitcnt(OldWait);
916+
Wait = Wait.combined(OldWait);
898917

899918
// Merge consecutive waitcnt of the same type by erasing multiples.
900-
if (!WaitcntInstr) {
919+
if (!WaitcntInstr &&
920+
(Wait.hasWaitExceptVsCnt() || !CanFullyDiscardWaitcntSequence)) {
901921
WaitcntInstr = &II;
902922
} else {
903923
II.eraseFromParent();
904924
Modified = true;
905925
}
906926

907927
} else {
908-
assert(II.getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
928+
assert(SIInstrInfo::isWaitcntVsCnt(Opcode));
909929
assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
910-
if (!TrackedWaitcntSet.count(&II)) {
911-
unsigned OldVSCnt =
912-
TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
913-
Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt);
914-
}
915930

916-
if (!WaitcntVsCntInstr) {
931+
unsigned OldVSCnt =
932+
TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
933+
if (CanFullyDiscardWaitcntSequence)
934+
ScoreBrackets.simplifyWaitcnt(InstCounterType::VS_CNT, OldVSCnt);
935+
Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt);
936+
937+
if (!WaitcntVsCntInstr &&
938+
(Wait.hasWaitVsCnt() || !CanFullyDiscardWaitcntSequence)) {
917939
WaitcntVsCntInstr = &II;
918940
} else {
919941
II.eraseFromParent();
@@ -924,48 +946,38 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(
924946

925947
// Updated encoding of merged waitcnt with the required wait.
926948
if (WaitcntInstr) {
927-
if (Wait.hasWaitExceptVsCnt()) {
928-
Modified |=
929-
updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
930-
AMDGPU::encodeWaitcnt(IV, Wait));
931-
ScoreBrackets.applyWaitcnt(Wait);
932-
Wait.VmCnt = ~0u;
933-
Wait.LgkmCnt = ~0u;
934-
Wait.ExpCnt = ~0u;
935-
936-
LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
937-
? dbgs() << "applyPreexistingWaitcnt\n"
938-
<< "New Instr at block end: " << *WaitcntInstr
939-
<< '\n'
940-
: dbgs() << "applyPreexistingWaitcnt\n"
941-
<< "Old Instr: " << *It
942-
<< "New Instr: " << *WaitcntInstr << '\n');
949+
Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
950+
AMDGPU::encodeWaitcnt(IV, Wait));
951+
Modified |= updateWaitcntIfSoft(WaitcntInstr);
943952

944-
} else {
945-
WaitcntInstr->eraseFromParent();
946-
Modified = true;
947-
}
953+
ScoreBrackets.applyWaitcnt(Wait);
954+
Wait.VmCnt = ~0u;
955+
Wait.LgkmCnt = ~0u;
956+
Wait.ExpCnt = ~0u;
957+
958+
LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
959+
? dbgs()
960+
<< "applyPreexistingWaitcnt\n"
961+
<< "New Instr at block end: " << *WaitcntInstr << '\n'
962+
: dbgs() << "applyPreexistingWaitcnt\n"
963+
<< "Old Instr: " << *It
964+
<< "New Instr: " << *WaitcntInstr << '\n');
948965
}
949966

950967
if (WaitcntVsCntInstr) {
951-
if (Wait.hasWaitVsCnt()) {
952-
assert(ST->hasVscnt());
953-
Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
954-
AMDGPU::OpName::simm16, Wait.VsCnt);
955-
ScoreBrackets.applyWaitcnt(Wait);
956-
Wait.VsCnt = ~0u;
957-
958-
LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
959-
? dbgs() << "applyPreexistingWaitcnt\n"
960-
<< "New Instr at block end: "
961-
<< *WaitcntVsCntInstr << '\n'
962-
: dbgs() << "applyPreexistingWaitcnt\n"
963-
<< "Old Instr: " << *It
964-
<< "New Instr: " << *WaitcntVsCntInstr << '\n');
965-
} else {
966-
WaitcntVsCntInstr->eraseFromParent();
967-
Modified = true;
968-
}
968+
Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
969+
AMDGPU::OpName::simm16, Wait.VsCnt);
970+
Modified |= updateWaitcntIfSoft(WaitcntVsCntInstr);
971+
ScoreBrackets.applyWaitcnt(Wait);
972+
Wait.VsCnt = ~0u;
973+
974+
LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
975+
? dbgs() << "applyPreexistingWaitcnt\n"
976+
<< "New Instr at block end: " << *WaitcntVsCntInstr
977+
<< '\n'
978+
: dbgs() << "applyPreexistingWaitcnt\n"
979+
<< "Old Instr: " << *It
980+
<< "New Instr: " << *WaitcntVsCntInstr << '\n');
969981
}
970982

971983
return Modified;
@@ -1317,7 +1329,6 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
13171329
unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
13181330
auto SWaitInst =
13191331
BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
1320-
TrackedWaitcntSet.insert(SWaitInst);
13211332
Modified = true;
13221333

13231334
LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
@@ -1331,7 +1342,6 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
13311342
auto SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
13321343
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
13331344
.addImm(Wait.VsCnt);
1334-
TrackedWaitcntSet.insert(SWaitInst);
13351345
Modified = true;
13361346

13371347
LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
@@ -1574,9 +1584,9 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
15741584
}
15751585

15761586
static bool isWaitInstr(MachineInstr &Inst) {
1577-
return Inst.getOpcode() == AMDGPU::S_WAITCNT ||
1578-
(Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1579-
Inst.getOperand(0).isReg() &&
1587+
auto Opcode = Inst.getOpcode();
1588+
return SIInstrInfo::isWaitcnt(Opcode) ||
1589+
(SIInstrInfo::isWaitcntVsCnt(Opcode) && Inst.getOperand(0).isReg() &&
15801590
Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
15811591
}
15821592

@@ -1845,7 +1855,6 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
18451855
TRI->getEncodingValue(AMDGPU::SGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
18461856
Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;
18471857

1848-
TrackedWaitcntSet.clear();
18491858
BlockInfos.clear();
18501859
bool Modified = false;
18511860

@@ -1863,6 +1872,11 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
18631872
;
18641873
BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
18651874

1875+
auto NonKernelInitialState =
1876+
std::make_unique<WaitcntBrackets>(ST, Limits, Encoding);
1877+
NonKernelInitialState->setNonKernelFunctionInitialState();
1878+
BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
1879+
18661880
Modified = true;
18671881
}
18681882

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8593,6 +8593,11 @@ bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
85938593
}
85948594

85958595
int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
8596+
8597+
// FIXME: move to the right place
8598+
if (SIInstrInfo::isSoftWaitcnt(Opcode))
8599+
Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
8600+
85968601
unsigned Gen = subtargetEncodingFamily(ST);
85978602

85988603
if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -845,6 +845,31 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
845845
return get(Opcode).TSFlags & SIInstrFlags::TiedSourceNotRead;
846846
}
847847

848+
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode) {
849+
if (isWaitcnt(Opcode))
850+
return AMDGPU::S_WAITCNT;
851+
852+
if (isWaitcntVsCnt(Opcode))
853+
return AMDGPU::S_WAITCNT_VSCNT;
854+
855+
llvm_unreachable("Expected opcode S_WAITCNT/S_WAITCNT_VSCNT");
856+
}
857+
858+
static bool isWaitcnt(unsigned Opcode) {
859+
return Opcode == AMDGPU::S_WAITCNT || Opcode == AMDGPU::S_WAITCNT_soft;
860+
}
861+
862+
static bool isWaitcntVsCnt(unsigned Opcode) {
863+
return Opcode == AMDGPU::S_WAITCNT_VSCNT ||
864+
Opcode == AMDGPU::S_WAITCNT_VSCNT_soft;
865+
}
866+
867+
// soft waitcnt instructions can be relaxed/optimized out by SIInsertWaitcnts
868+
static bool isSoftWaitcnt(unsigned Opcode) {
869+
return Opcode == AMDGPU::S_WAITCNT_soft ||
870+
Opcode == AMDGPU::S_WAITCNT_VSCNT_soft;
871+
}
872+
848873
bool isVGPRCopy(const MachineInstr &MI) const {
849874
assert(isCopyInstr(MI));
850875
Register Dest = MI.getOperand(0).getReg();

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1055,7 +1055,8 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
10551055
VMCnt ? 0 : getVmcntBitMask(IV),
10561056
getExpcntBitMask(IV),
10571057
LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1058-
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
1058+
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1059+
.addImm(WaitCntImmediate);
10591060
Changed = true;
10601061
}
10611062

@@ -1963,14 +1964,15 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
19631964
VMCnt ? 0 : getVmcntBitMask(IV),
19641965
getExpcntBitMask(IV),
19651966
LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1966-
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
1967+
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1968+
.addImm(WaitCntImmediate);
19671969
Changed = true;
19681970
}
19691971

19701972
if (VSCnt) {
1971-
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1972-
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1973-
.addImm(0);
1973+
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
1974+
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1975+
.addImm(0);
19741976
Changed = true;
19751977
}
19761978

llvm/lib/Target/AMDGPU/SOPInstructions.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1459,6 +1459,16 @@ def S_WAKEUP : SOPP_Pseudo <"s_wakeup", (ins) > {
14591459

14601460
def S_WAITCNT : SOPP_Pseudo <"s_waitcnt" , (ins SWaitCnt:$simm16), "$simm16",
14611461
[(int_amdgcn_s_waitcnt timm:$simm16)]>;
1462+
1463+
// "_soft" waitcnts are waitcnts whose wait can be relaxed or completely removed.
1464+
// These are inserted by the memory legalizer to resolve memory dependencies and are later optimized by SIInsertWaitcnts.
1465+
// For example, an S_WAITCNT_soft 0 can be completely removed in a function that doesn't access memory.
1466+
def S_WAITCNT_soft : SOPP_Pseudo <"s_soft_waitcnt" , (ins SWaitCnt:$simm16), "$simm16">;
1467+
def S_WAITCNT_VSCNT_soft : SOPP_Pseudo<"s_soft_waitcnt_vscnt", (ins SReg_32:$sdst, s16imm:$simm16), "$sdst, $simm16"> {
1468+
let mayLoad = 1;
1469+
let mayStore = 1;
1470+
let has_sdst = 1;
1471+
}
14621472
def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
14631473
[(int_amdgcn_s_sethalt timm:$simm16)]>;
14641474
def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">;

0 commit comments

Comments
 (0)