Skip to content

Commit cdcc791

Browse files
jmmartinez authored and Pierre-vh committed
[AMDGPU][SIInsertWaitcnts] Do not add s_waitcnt when the counters are known to be 0 already
1 parent f1ea77f commit cdcc791

File tree

64 files changed

+957
-10908
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

64 files changed

+957
-10908
lines changed

llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -25,10 +25,12 @@ void AMDGPUInstrPostProcess::postProcessInstruction(
2525
std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
2626
switch (MCI.getOpcode()) {
2727
case AMDGPU::S_WAITCNT:
28+
case AMDGPU::S_WAITCNT_soft:
2829
case AMDGPU::S_WAITCNT_EXPCNT:
2930
case AMDGPU::S_WAITCNT_LGKMCNT:
3031
case AMDGPU::S_WAITCNT_VMCNT:
3132
case AMDGPU::S_WAITCNT_VSCNT:
33+
case AMDGPU::S_WAITCNT_VSCNT_soft:
3234
case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
3335
case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
3436
case AMDGPU::S_WAITCNT_VMCNT_gfx10:
@@ -77,10 +79,12 @@ unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef<InstRef> IssuedInst,
7779
default:
7880
return 0;
7981
case AMDGPU::S_WAITCNT: // This instruction
82+
case AMDGPU::S_WAITCNT_soft:
8083
case AMDGPU::S_WAITCNT_EXPCNT:
8184
case AMDGPU::S_WAITCNT_LGKMCNT:
8285
case AMDGPU::S_WAITCNT_VMCNT:
83-
case AMDGPU::S_WAITCNT_VSCNT: // to this instruction are all pseudo.
86+
case AMDGPU::S_WAITCNT_VSCNT:
87+
case AMDGPU::S_WAITCNT_VSCNT_soft: // to this instruction are all pseudo.
8488
case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
8589
case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
8690
case AMDGPU::S_WAITCNT_VMCNT_gfx10:

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 67 additions & 68 deletions
Original file line number | Diff line number | Diff line change
@@ -369,7 +369,6 @@ class SIInsertWaitcnts : public MachineFunctionPass {
369369
const MachineRegisterInfo *MRI = nullptr;
370370
AMDGPU::IsaVersion IV;
371371

372-
DenseSet<MachineInstr *> TrackedWaitcntSet;
373372
DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
374373
DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
375374
MachineLoopInfo *MLI;
@@ -493,6 +492,9 @@ class SIInsertWaitcnts : public MachineFunctionPass {
493492
MachineInstr &OldWaitcntInstr,
494493
AMDGPU::Waitcnt &Wait,
495494
MachineBasicBlock::instr_iterator It) const;
495+
496+
// Transform a soft waitcnt into a normal one.
497+
bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
496498
};
497499

498500
} // end anonymous namespace
@@ -872,6 +874,15 @@ static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName,
872874
return true;
873875
}
874876

877+
bool SIInsertWaitcnts::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
878+
unsigned Opcode = Waitcnt->getOpcode();
879+
if (!SIInstrInfo::isSoftWaitcnt(Opcode))
880+
return false;
881+
882+
Waitcnt->setDesc(TII->get(SIInstrInfo::getNonSoftWaitcntOpcode(Opcode)));
883+
return true;
884+
}
885+
875886
/// Combine consecutive waitcnt instructions that precede \p It and follow
876887
/// \p OldWaitcntInstr and apply any extra wait from waitcnt that were added
877888
/// by previous passes. Currently this pass conservatively assumes that these
@@ -888,86 +899,77 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(
888899
if (II.isMetaInstruction())
889900
continue;
890901

891-
if (II.getOpcode() == AMDGPU::S_WAITCNT) {
892-
// Conservatively update required wait if this waitcnt was added in an
893-
// earlier pass. In this case it will not exist in the tracked waitcnt
894-
// set.
895-
if (!TrackedWaitcntSet.count(&II)) {
896-
unsigned IEnc = II.getOperand(0).getImm();
897-
AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
898-
Wait = Wait.combined(OldWait);
899-
}
902+
unsigned Opcode = II.getOpcode();
903+
bool IsSoft = SIInstrInfo::isSoftWaitcnt(Opcode);
904+
905+
if (SIInstrInfo::isWaitcnt(Opcode)) {
906+
// Update required wait count. If this is a soft waitcnt (= it was added
907+
// by an earlier pass), it may be entirely removed.
908+
unsigned IEnc = II.getOperand(0).getImm();
909+
AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
910+
if (IsSoft)
911+
ScoreBrackets.simplifyWaitcnt(OldWait);
912+
Wait = Wait.combined(OldWait);
900913

901914
// Merge consecutive waitcnt of the same type by erasing multiples.
902-
if (!WaitcntInstr) {
903-
WaitcntInstr = &II;
904-
} else {
915+
if (WaitcntInstr || (!Wait.hasWaitExceptVsCnt() && IsSoft)) {
905916
II.eraseFromParent();
906917
Modified = true;
907-
}
918+
} else
919+
WaitcntInstr = &II;
908920

909921
} else {
910-
assert(II.getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
922+
assert(SIInstrInfo::isWaitcntVsCnt(Opcode));
911923
assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
912-
if (!TrackedWaitcntSet.count(&II)) {
913-
unsigned OldVSCnt =
914-
TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
915-
Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt);
916-
}
917924

918-
if (!WaitcntVsCntInstr) {
919-
WaitcntVsCntInstr = &II;
920-
} else {
925+
unsigned OldVSCnt =
926+
TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
927+
if (IsSoft)
928+
ScoreBrackets.simplifyWaitcnt(InstCounterType::VS_CNT, OldVSCnt);
929+
Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt);
930+
931+
if (WaitcntVsCntInstr || (!Wait.hasWaitVsCnt() && IsSoft)) {
921932
II.eraseFromParent();
922933
Modified = true;
923-
}
934+
} else
935+
WaitcntVsCntInstr = &II;
924936
}
925937
}
926938

927939
// Updated encoding of merged waitcnt with the required wait.
928940
if (WaitcntInstr) {
929-
if (Wait.hasWaitExceptVsCnt()) {
930-
Modified |=
931-
updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
932-
AMDGPU::encodeWaitcnt(IV, Wait));
933-
ScoreBrackets.applyWaitcnt(Wait);
934-
Wait.VmCnt = ~0u;
935-
Wait.LgkmCnt = ~0u;
936-
Wait.ExpCnt = ~0u;
937-
938-
LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
939-
? dbgs() << "applyPreexistingWaitcnt\n"
940-
<< "New Instr at block end: " << *WaitcntInstr
941-
<< '\n'
942-
: dbgs() << "applyPreexistingWaitcnt\n"
943-
<< "Old Instr: " << *It
944-
<< "New Instr: " << *WaitcntInstr << '\n');
941+
Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
942+
AMDGPU::encodeWaitcnt(IV, Wait));
943+
Modified |= promoteSoftWaitCnt(WaitcntInstr);
945944

946-
} else {
947-
WaitcntInstr->eraseFromParent();
948-
Modified = true;
949-
}
945+
ScoreBrackets.applyWaitcnt(Wait);
946+
Wait.VmCnt = ~0u;
947+
Wait.LgkmCnt = ~0u;
948+
Wait.ExpCnt = ~0u;
949+
950+
LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
951+
? dbgs()
952+
<< "applyPreexistingWaitcnt\n"
953+
<< "New Instr at block end: " << *WaitcntInstr << '\n'
954+
: dbgs() << "applyPreexistingWaitcnt\n"
955+
<< "Old Instr: " << *It
956+
<< "New Instr: " << *WaitcntInstr << '\n');
950957
}
951958

952959
if (WaitcntVsCntInstr) {
953-
if (Wait.hasWaitVsCnt()) {
954-
assert(ST->hasVscnt());
955-
Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
956-
AMDGPU::OpName::simm16, Wait.VsCnt);
957-
ScoreBrackets.applyWaitcnt(Wait);
958-
Wait.VsCnt = ~0u;
959-
960-
LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
961-
? dbgs() << "applyPreexistingWaitcnt\n"
962-
<< "New Instr at block end: "
963-
<< *WaitcntVsCntInstr << '\n'
964-
: dbgs() << "applyPreexistingWaitcnt\n"
965-
<< "Old Instr: " << *It
966-
<< "New Instr: " << *WaitcntVsCntInstr << '\n');
967-
} else {
968-
WaitcntVsCntInstr->eraseFromParent();
969-
Modified = true;
970-
}
960+
Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
961+
AMDGPU::OpName::simm16, Wait.VsCnt);
962+
Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
963+
ScoreBrackets.applyWaitcnt(Wait);
964+
Wait.VsCnt = ~0u;
965+
966+
LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
967+
? dbgs() << "applyPreexistingWaitcnt\n"
968+
<< "New Instr at block end: " << *WaitcntVsCntInstr
969+
<< '\n'
970+
: dbgs() << "applyPreexistingWaitcnt\n"
971+
<< "Old Instr: " << *It
972+
<< "New Instr: " << *WaitcntVsCntInstr << '\n');
971973
}
972974

973975
return Modified;
@@ -1319,7 +1321,6 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
13191321
unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
13201322
auto SWaitInst =
13211323
BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
1322-
TrackedWaitcntSet.insert(SWaitInst);
13231324
Modified = true;
13241325

13251326
LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
@@ -1333,7 +1334,6 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
13331334
auto SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
13341335
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
13351336
.addImm(Wait.VsCnt);
1336-
TrackedWaitcntSet.insert(SWaitInst);
13371337
Modified = true;
13381338

13391339
LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
@@ -1581,9 +1581,9 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
15811581
}
15821582

15831583
static bool isWaitInstr(MachineInstr &Inst) {
1584-
return Inst.getOpcode() == AMDGPU::S_WAITCNT ||
1585-
(Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1586-
Inst.getOperand(0).isReg() &&
1584+
auto Opcode = Inst.getOpcode();
1585+
return SIInstrInfo::isWaitcnt(Opcode) ||
1586+
(SIInstrInfo::isWaitcntVsCnt(Opcode) && Inst.getOperand(0).isReg() &&
15871587
Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
15881588
}
15891589

@@ -1852,7 +1852,6 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
18521852
TRI->getEncodingValue(AMDGPU::SGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
18531853
Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;
18541854

1855-
TrackedWaitcntSet.clear();
18561855
BlockInfos.clear();
18571856
bool Modified = false;
18581857

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -8747,6 +8747,9 @@ bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
87478747
}
87488748

87498749
int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
8750+
if (SIInstrInfo::isSoftWaitcnt(Opcode))
8751+
Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
8752+
87508753
unsigned Gen = subtargetEncodingFamily(ST);
87518754

87528755
if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 26 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -895,6 +895,32 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
895895
return get(Opcode).TSFlags & SIInstrFlags::TiedSourceNotRead;
896896
}
897897

898+
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode) {
899+
if (isWaitcnt(Opcode))
900+
return AMDGPU::S_WAITCNT;
901+
902+
if (isWaitcntVsCnt(Opcode))
903+
return AMDGPU::S_WAITCNT_VSCNT;
904+
905+
llvm_unreachable("Expected opcode S_WAITCNT/S_WAITCNT_VSCNT");
906+
}
907+
908+
static bool isWaitcnt(unsigned Opcode) {
909+
return Opcode == AMDGPU::S_WAITCNT || Opcode == AMDGPU::S_WAITCNT_soft;
910+
}
911+
912+
static bool isWaitcntVsCnt(unsigned Opcode) {
913+
return Opcode == AMDGPU::S_WAITCNT_VSCNT ||
914+
Opcode == AMDGPU::S_WAITCNT_VSCNT_soft;
915+
}
916+
917+
// "Soft" waitcnt instructions can be relaxed/optimized out by
918+
// SIInsertWaitcnts.
919+
static bool isSoftWaitcnt(unsigned Opcode) {
920+
return Opcode == AMDGPU::S_WAITCNT_soft ||
921+
Opcode == AMDGPU::S_WAITCNT_VSCNT_soft;
922+
}
923+
898924
bool isVGPRCopy(const MachineInstr &MI) const {
899925
assert(isCopyInstr(MI));
900926
Register Dest = MI.getOperand(0).getReg();

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 7 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1055,7 +1055,8 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
10551055
VMCnt ? 0 : getVmcntBitMask(IV),
10561056
getExpcntBitMask(IV),
10571057
LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1058-
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
1058+
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1059+
.addImm(WaitCntImmediate);
10591060
Changed = true;
10601061
}
10611062

@@ -1963,14 +1964,15 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
19631964
VMCnt ? 0 : getVmcntBitMask(IV),
19641965
getExpcntBitMask(IV),
19651966
LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1966-
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
1967+
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1968+
.addImm(WaitCntImmediate);
19671969
Changed = true;
19681970
}
19691971

19701972
if (VSCnt) {
1971-
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1972-
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1973-
.addImm(0);
1973+
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
1974+
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1975+
.addImm(0);
19741976
Changed = true;
19751977
}
19761978

llvm/lib/Target/AMDGPU/SOPInstructions.td

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1590,6 +1590,17 @@ def S_WAKEUP : SOPP_Pseudo <"s_wakeup", (ins) > {
15901590

15911591
def S_WAITCNT : SOPP_Pseudo <"s_waitcnt" , (ins SWaitCnt:$simm16), "$simm16",
15921592
[(int_amdgcn_s_waitcnt timm:$simm16)]>;
1593+
1594+
// "_soft" waitcnts are waitcnts that are either relaxed into their non-soft
1595+
// counterpart, or completely removed.
1596+
//
1597+
// These are inserted by SIMemoryLegalizer to resolve memory dependencies
1598+
// and later optimized by SIInsertWaitcnts
1599+
// For example, a S_WAITCNT_soft 0 can be completely removed in a function
1600+
// that doesn't access memory.
1601+
def S_WAITCNT_soft : SOPP_Pseudo <"s_soft_waitcnt" , (ins SWaitCnt:$simm16), "$simm16">;
1602+
def S_WAITCNT_VSCNT_soft : SOPK_WAITCNT<"s_soft_waitcnt_vscnt">;
1603+
15931604
def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
15941605
[(int_amdgcn_s_sethalt timm:$simm16)]>;
15951606
def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">;

0 commit comments

Comments
 (0)