Skip to content

[AMDGPU] Refine operand iterators in the SIInsertWaitcnts. NFCI. #108884

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Sep 17, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
187 changes: 79 additions & 108 deletions llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,8 @@ class WaitcntBrackets {

RegInterval getRegInterval(const MachineInstr *MI,
const MachineRegisterInfo *MRI,
const SIRegisterInfo *TRI, unsigned OpNo) const;
const SIRegisterInfo *TRI,
const MachineOperand &Op) const;

bool counterOutOfOrder(InstCounterType T) const;
void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
Expand Down Expand Up @@ -405,9 +406,9 @@ class WaitcntBrackets {
}
}

void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
unsigned OpNo, unsigned Val);
void setExpScore(const MachineInstr *MI, const SIRegisterInfo *TRI,
const MachineRegisterInfo *MRI, const MachineOperand &Op,
unsigned Val);

const GCNSubtarget *ST = nullptr;
InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS;
Expand Down Expand Up @@ -734,8 +735,7 @@ class SIInsertWaitcnts : public MachineFunctionPass {
RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
const MachineRegisterInfo *MRI,
const SIRegisterInfo *TRI,
unsigned OpNo) const {
const MachineOperand &Op = MI->getOperand(OpNo);
const MachineOperand &Op) const {
if (!TRI->isInAllocatableClass(Op.getReg()))
return {-1, -1};

Expand Down Expand Up @@ -773,12 +773,11 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
}

void WaitcntBrackets::setExpScore(const MachineInstr *MI,
const SIInstrInfo *TII,
const SIRegisterInfo *TRI,
const MachineRegisterInfo *MRI, unsigned OpNo,
unsigned Val) {
RegInterval Interval = getRegInterval(MI, MRI, TRI, OpNo);
assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg()));
const MachineRegisterInfo *MRI,
const MachineOperand &Op, unsigned Val) {
RegInterval Interval = getRegInterval(MI, MRI, TRI, Op);
assert(TRI->isVectorRegister(*MRI, Op.getReg()));
for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
setRegScore(RegNo, EXP_CNT, Val);
}
Expand All @@ -804,79 +803,60 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
// Put score on the source vgprs. If this is a store, just use those
// specific register(s).
if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
int AddrOpIdx =
AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
// All GDS operations must protect their address register (same as
// export.)
if (AddrOpIdx != -1) {
setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
}
if (const auto *AddrOp = TII->getNamedOperand(Inst, AMDGPU::OpName::addr))
setExpScore(&Inst, TRI, MRI, *AddrOp, CurrScore);

if (Inst.mayStore()) {
if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data0)) {
setExpScore(
&Inst, TII, TRI, MRI,
AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
CurrScore);
}
if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data1)) {
setExpScore(&Inst, TII, TRI, MRI,
AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
AMDGPU::OpName::data1),
CurrScore);
}
if (const auto *Data0 =
TII->getNamedOperand(Inst, AMDGPU::OpName::data0))
setExpScore(&Inst, TRI, MRI, *Data0, CurrScore);
if (const auto *Data1 =
TII->getNamedOperand(Inst, AMDGPU::OpName::data1))
setExpScore(&Inst, TRI, MRI, *Data1, CurrScore);
} else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
Inst.getOpcode() != AMDGPU::DS_APPEND &&
Inst.getOpcode() != AMDGPU::DS_CONSUME &&
Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
const MachineOperand &Op = Inst.getOperand(I);
if (Op.isReg() && !Op.isDef() &&
TRI->isVectorRegister(*MRI, Op.getReg())) {
setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
}
for (const MachineOperand &Op : Inst.all_uses()) {
if (Op.isReg() && TRI->isVectorRegister(*MRI, Op.getReg()))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: don't need the isReg check.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately the documentation is misleading, and a use is not always a register.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not a register but a simple immediate:

2329│       for (const MachineOperand &Op : MI.uses()) {
2330├>        if (!TRI->isVectorRegister(*MRI, Op.getReg()))
2331│           continue;

(gdb) call Op.dump()
17408
(gdb) call MI.dump()
  renamable $vgpr3 = nofpexcept V_MAX_F16_fake16_e32 17408, killed $vgpr3, implicit $mode, implicit $exec

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah but that's uses. For all_uses it is always a reg. I know, this situation is confusing and not ideal.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do not want to change it, because the very next change will be to change all_uses with just uses. Also NFCI, but of a different kind.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just by the way, any idea why that is so? Why does uses() include non-register operands? And why is it then documented as returning register uses?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And furthermore: why does all_uses() not include all of the uses?

Copy link
Collaborator Author

@rampitec rampitec Sep 17, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean this:

`iterator_range<const_mop_iterator> uses() const` — "Returns a range that includes all operands that are register uses."

One really has to dig deep to get to this statement, which undermines it completely:

This may include unrelated operands which are not register uses. 

The high level description at the https://llvm.org/doxygen/classllvm_1_1MachineInstr.html does not include that last statement.

setExpScore(&Inst, TRI, MRI, Op, CurrScore);
}
}
} else if (TII->isFLAT(Inst)) {
if (Inst.mayStore()) {
setExpScore(
&Inst, TII, TRI, MRI,
AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
CurrScore);
setExpScore(&Inst, TRI, MRI,
*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
CurrScore);
} else if (SIInstrInfo::isAtomicRet(Inst)) {
setExpScore(
&Inst, TII, TRI, MRI,
AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
CurrScore);
setExpScore(&Inst, TRI, MRI,
*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
CurrScore);
}
} else if (TII->isMIMG(Inst)) {
if (Inst.mayStore()) {
setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
setExpScore(&Inst, TRI, MRI, Inst.getOperand(0), CurrScore);
} else if (SIInstrInfo::isAtomicRet(Inst)) {
setExpScore(
&Inst, TII, TRI, MRI,
AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
CurrScore);
setExpScore(&Inst, TRI, MRI,
*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
CurrScore);
}
} else if (TII->isMTBUF(Inst)) {
if (Inst.mayStore()) {
setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
}
if (Inst.mayStore())
setExpScore(&Inst, TRI, MRI, Inst.getOperand(0), CurrScore);
} else if (TII->isMUBUF(Inst)) {
if (Inst.mayStore()) {
setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
setExpScore(&Inst, TRI, MRI, Inst.getOperand(0), CurrScore);
} else if (SIInstrInfo::isAtomicRet(Inst)) {
setExpScore(
&Inst, TII, TRI, MRI,
AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
CurrScore);
setExpScore(&Inst, TRI, MRI,
*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
CurrScore);
}
} else if (TII->isLDSDIR(Inst)) {
// LDSDIR instructions attach the score to the destination.
setExpScore(
&Inst, TII, TRI, MRI,
AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdst),
CurrScore);
setExpScore(&Inst, TRI, MRI,
*TII->getNamedOperand(Inst, AMDGPU::OpName::vdst), CurrScore);
} else {
if (TII->isEXP(Inst)) {
// For export the destination registers are really temps that
Expand All @@ -891,12 +871,9 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
}
}
}
for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
MachineOperand &MO = Inst.getOperand(I);
if (MO.isReg() && !MO.isDef() &&
TRI->isVectorRegister(*MRI, MO.getReg())) {
setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
}
for (const MachineOperand &Op : Inst.all_uses()) {
if (Op.isReg() && TRI->isVectorRegister(*MRI, Op.getReg()))
setExpScore(&Inst, TRI, MRI, Op, CurrScore);
}
}
} else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
Expand All @@ -907,14 +884,10 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
// artificial dependency, while these are there only for register liveness
// accounting purposes.
//
// Special cases where implicit register defs and uses exists, such as
// M0, FLAT_SCR or VCC, but the wait will be generated earlier in the
// generateWaitcntInstBefore() if that was loaded from memory.
for (unsigned I = 0, E = Inst.getNumExplicitOperands(); I != E; ++I) {
auto &Op = Inst.getOperand(I);
if (!Op.isReg() || !Op.isDef())
continue;
RegInterval Interval = getRegInterval(&Inst, MRI, TRI, I);
// Special cases where implicit register defs exists, such as M0 or VCC,
// but none with memory instructions.
for (const MachineOperand &Op : Inst.defs()) {
RegInterval Interval = getRegInterval(&Inst, MRI, TRI, Op);
if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
if (Interval.first >= NUM_ALL_VGPRS)
continue;
Expand Down Expand Up @@ -1692,22 +1665,19 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
// load). We also need to check WAW dependency with saved PC.
Wait = AMDGPU::Waitcnt();

int CallAddrOpIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

if (MI.getOperand(CallAddrOpIdx).isReg()) {
const auto &CallAddrOp = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
if (CallAddrOp.isReg()) {
RegInterval CallAddrOpInterval =
ScoreBrackets.getRegInterval(&MI, MRI, TRI, CallAddrOpIdx);
ScoreBrackets.getRegInterval(&MI, MRI, TRI, CallAddrOp);

for (int RegNo = CallAddrOpInterval.first;
RegNo < CallAddrOpInterval.second; ++RegNo)
ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);

int RtnAddrOpIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
if (RtnAddrOpIdx != -1) {
if (const auto *RtnAddrOp =
TII->getNamedOperand(MI, AMDGPU::OpName::dst)) {
RegInterval RtnAddrOpInterval =
ScoreBrackets.getRegInterval(&MI, MRI, TRI, RtnAddrOpIdx);
ScoreBrackets.getRegInterval(&MI, MRI, TRI, *RtnAddrOp);

for (int RegNo = RtnAddrOpInterval.first;
RegNo < RtnAddrOpInterval.second; ++RegNo)
Expand Down Expand Up @@ -1769,16 +1739,15 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
}

// Loop over use and def operands.
for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
MachineOperand &Op = MI.getOperand(I);
for (const MachineOperand &Op : MI.operands()) {
if (!Op.isReg())
continue;

// If the instruction does not read tied source, skip the operand.
if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
continue;

RegInterval Interval = ScoreBrackets.getRegInterval(&MI, MRI, TRI, I);
RegInterval Interval = ScoreBrackets.getRegInterval(&MI, MRI, TRI, Op);

const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
Expand Down Expand Up @@ -2357,41 +2326,43 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
if (MI.mayStore())
HasVMemStore = true;
}
for (unsigned I = 0; I < MI.getNumOperands(); I++) {
MachineOperand &Op = MI.getOperand(I);
for (const MachineOperand &Op : MI.all_uses()) {
if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg()))
continue;
RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, I);
RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, Op);
// Vgpr use
if (Op.isUse()) {
for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
// If we find a register that is loaded inside the loop, 1. and 2.
// are invalidated and we can exit.
if (VgprDef.contains(RegNo))
return false;
VgprUse.insert(RegNo);
// If at least one of Op's registers is in the score brackets, the
// value is likely loaded outside of the loop.
if (Brackets.getRegScore(RegNo, LOAD_CNT) >
Brackets.getScoreLB(LOAD_CNT) ||
Brackets.getRegScore(RegNo, SAMPLE_CNT) >
Brackets.getScoreLB(SAMPLE_CNT) ||
Brackets.getRegScore(RegNo, BVH_CNT) >
Brackets.getScoreLB(BVH_CNT)) {
UsesVgprLoadedOutside = true;
break;
}
for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
// If we find a register that is loaded inside the loop, 1. and 2.
// are invalidated and we can exit.
if (VgprDef.contains(RegNo))
return false;
VgprUse.insert(RegNo);
// If at least one of Op's registers is in the score brackets, the
// value is likely loaded outside of the loop.
if (Brackets.getRegScore(RegNo, LOAD_CNT) >
Brackets.getScoreLB(LOAD_CNT) ||
Brackets.getRegScore(RegNo, SAMPLE_CNT) >
Brackets.getScoreLB(SAMPLE_CNT) ||
Brackets.getRegScore(RegNo, BVH_CNT) >
Brackets.getScoreLB(BVH_CNT)) {
UsesVgprLoadedOutside = true;
break;
}
}
// VMem load vgpr def
else if (isVMEMOrFlatVMEM(MI) && MI.mayLoad() && Op.isDef())
}

// VMem load vgpr def
if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
for (const MachineOperand &Op : MI.all_defs()) {
RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, Op);
for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
// If we find a register that is loaded inside the loop, 1. and 2.
// are invalidated and we can exit.
if (VgprUse.contains(RegNo))
return false;
VgprDef.insert(RegNo);
}
}
}
}
}
Expand Down
Loading