Skip to content

[AMDGPU] Ignore inactive VGPRs in .vgpr_count #144855

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -991,7 +991,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// dispatch registers are function args.
unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;

if (isShader(F.getCallingConv())) {
if (AMDGPU::shouldReportUnusedFuncArgs(F.getCallingConv())) {
bool IsPixelShader =
F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();

Expand Down
299 changes: 44 additions & 255 deletions llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -139,268 +139,56 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(

Info.UsesVCC =
MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);
Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass,
/*IncludeCalls=*/false);
if (ST.hasMAIInsts())
Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass,
/*IncludeCalls=*/false);

// If there are no calls, MachineRegisterInfo can tell us the used register
// count easily.
// A tail call isn't considered a call for MachineFrameInfo's purposes.
if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
if (ST.hasMAIInsts())
Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
bool HasCalls = FrameInfo.hasCalls() || FrameInfo.hasTailCall();
// Functions that use the llvm.amdgcn.init.whole.wave intrinsic often have
// VGPR arguments that are only added for the purpose of preserving the
// inactive lanes. These should not be included in the number of used VGPRs.
bool NeedsExplicitVGPRCount = MFI->hasInitWholeWave();
if (!HasCalls && !NeedsExplicitVGPRCount) {

Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass,
/*IncludeCalls=*/false);
return Info;
}

int32_t MaxVGPR = -1;
int32_t MaxAGPR = -1;
int32_t MaxSGPR = -1;
Info.CalleeSegmentSize = 0;

for (const MachineBasicBlock &MBB : MF) {
for (const MachineInstr &MI : MBB) {
// TODO: Check regmasks? Do they occur anywhere except calls?
for (const MachineOperand &MO : MI.operands()) {
unsigned Width = 0;
bool IsSGPR = false;
bool IsAGPR = false;

if (!MO.isReg())
continue;

Register Reg = MO.getReg();
switch (Reg) {
case AMDGPU::EXEC:
case AMDGPU::EXEC_LO:
case AMDGPU::EXEC_HI:
case AMDGPU::SCC:
case AMDGPU::M0:
case AMDGPU::M0_LO16:
case AMDGPU::M0_HI16:
case AMDGPU::SRC_SHARED_BASE_LO:
case AMDGPU::SRC_SHARED_BASE:
case AMDGPU::SRC_SHARED_LIMIT_LO:
case AMDGPU::SRC_SHARED_LIMIT:
case AMDGPU::SRC_PRIVATE_BASE_LO:
case AMDGPU::SRC_PRIVATE_BASE:
case AMDGPU::SRC_PRIVATE_LIMIT_LO:
case AMDGPU::SRC_PRIVATE_LIMIT:
case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
case AMDGPU::SGPR_NULL:
case AMDGPU::SGPR_NULL64:
case AMDGPU::MODE:
continue;

case AMDGPU::NoRegister:
assert(MI.isDebugInstr() &&
"Instruction uses invalid noreg register");
continue;

case AMDGPU::VCC:
case AMDGPU::VCC_LO:
case AMDGPU::VCC_HI:
case AMDGPU::VCC_LO_LO16:
case AMDGPU::VCC_LO_HI16:
case AMDGPU::VCC_HI_LO16:
case AMDGPU::VCC_HI_HI16:
Info.UsesVCC = true;
continue;

case AMDGPU::FLAT_SCR:
case AMDGPU::FLAT_SCR_LO:
case AMDGPU::FLAT_SCR_HI:
continue;

case AMDGPU::XNACK_MASK:
case AMDGPU::XNACK_MASK_LO:
case AMDGPU::XNACK_MASK_HI:
llvm_unreachable("xnack_mask registers should not be used");

case AMDGPU::LDS_DIRECT:
llvm_unreachable("lds_direct register should not be used");

case AMDGPU::TBA:
case AMDGPU::TBA_LO:
case AMDGPU::TBA_HI:
case AMDGPU::TMA:
case AMDGPU::TMA_LO:
case AMDGPU::TMA_HI:
llvm_unreachable("trap handler registers should not be used");

case AMDGPU::SRC_VCCZ:
llvm_unreachable("src_vccz register should not be used");

case AMDGPU::SRC_EXECZ:
llvm_unreachable("src_execz register should not be used");

case AMDGPU::SRC_SCC:
llvm_unreachable("src_scc register should not be used");

default:
break;
}

if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
IsSGPR = true;
Width = 1;
} else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
AMDGPU::VGPR_16RegClass.contains(Reg)) {
IsSGPR = false;
Width = 1;
} else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 1;
} else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
IsSGPR = true;
Width = 2;
} else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
IsSGPR = false;
Width = 2;
} else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 2;
} else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
IsSGPR = false;
Width = 3;
} else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
IsSGPR = true;
Width = 3;
} else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 3;
} else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
IsSGPR = true;
Width = 4;
} else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
IsSGPR = false;
Width = 4;
} else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 4;
} else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
IsSGPR = false;
Width = 5;
} else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
IsSGPR = true;
Width = 5;
} else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 5;
} else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
IsSGPR = false;
Width = 6;
} else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
IsSGPR = true;
Width = 6;
} else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 6;
} else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
IsSGPR = false;
Width = 7;
} else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
IsSGPR = true;
Width = 7;
} else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 7;
} else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
IsSGPR = true;
Width = 8;
} else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
IsSGPR = false;
Width = 8;
} else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 8;
} else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
IsSGPR = false;
Width = 9;
} else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
IsSGPR = true;
Width = 9;
} else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 9;
} else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
IsSGPR = false;
Width = 10;
} else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
IsSGPR = true;
Width = 10;
} else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 10;
} else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
IsSGPR = false;
Width = 11;
} else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
IsSGPR = true;
Width = 11;
} else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 11;
} else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
IsSGPR = false;
Width = 12;
} else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
IsSGPR = true;
Width = 12;
} else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 12;
} else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
IsSGPR = true;
Width = 16;
} else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
IsSGPR = false;
Width = 16;
} else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 16;
} else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
IsSGPR = true;
Width = 32;
} else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
IsSGPR = false;
Width = 32;
} else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 32;
} else {
// We only expect TTMP registers or registers that do not belong to
// any RC.
assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
AMDGPU::TTMP_64RegClass.contains(Reg) ||
AMDGPU::TTMP_128RegClass.contains(Reg) ||
AMDGPU::TTMP_256RegClass.contains(Reg) ||
AMDGPU::TTMP_512RegClass.contains(Reg) ||
!TRI.getPhysRegBaseClass(Reg)) &&
"Unknown register class");
}
unsigned HWReg = TRI.getHWRegIndex(Reg);
int MaxUsed = HWReg + Width - 1;
if (IsSGPR) {
MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
} else if (IsAGPR) {
MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
} else {
if (NeedsExplicitVGPRCount) {
for (unsigned i = 0; i < MI.getNumOperands(); ++i) {
const MachineOperand &MO = MI.getOperand(i);

if (!MO.isReg())
continue;
Register Reg = MO.getReg();
const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(Reg);

if (!RC || !TRI.isVGPRClass(RC))
continue;

// Skip inactive VGPRs in chain functions with the init.whole.wave
// intrinsic. These will only appear as implicit use operands on the
// chain call, and as the def of an IMPLICIT_DEF. We're going to skip
// implicit defs unconditionally though because if they're important
// in a different context then they will be counted when they are
// used.
bool IsChainCall =
MFI->isChainFunction() && MI.getOpcode() == AMDGPU::SI_TCRETURN;
if (IsChainCall || MI.isImplicitDef())
continue;

unsigned Width = TRI.getRegSizeInBits(*RC) / 32;
unsigned HWReg = TRI.getHWRegIndex(Reg);
int MaxUsed = HWReg + Width - 1;
MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
}
}
Expand Down Expand Up @@ -464,9 +252,10 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
}
}

Info.NumExplicitSGPR = MaxSGPR + 1;
Info.NumVGPR = MaxVGPR + 1;
Info.NumAGPR = MaxAGPR + 1;
if (NeedsExplicitVGPRCount)
Info.NumVGPR = MaxVGPR + 1;
else
Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass, false);

return Info;
}
8 changes: 4 additions & 4 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4046,11 +4046,11 @@ SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC,
return 0;
}

unsigned
SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
const TargetRegisterClass &RC) const {
unsigned SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
const TargetRegisterClass &RC,
bool IncludeCalls) const {
for (MCPhysReg Reg : reverse(RC.getRegisters()))
if (MRI.isPhysRegUsed(Reg))
if (MRI.isPhysRegUsed(Reg, /*SkipRegMaskTest=*/!IncludeCalls))
return getHWRegIndex(Reg) + 1;
return 0;
}
Expand Down
6 changes: 4 additions & 2 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -482,9 +482,11 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
unsigned SubReg) const;

// \returns the number of registers of a given \p RC used in a function.
// Does not go inside function calls.
// Does not go inside function calls. If \p IncludeCalls is true, it will
// include registers that may be clobbered by calls.
unsigned getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
const TargetRegisterClass &RC) const;
const TargetRegisterClass &RC,
bool IncludeCalls = true) const;

std::optional<uint8_t> getVRegFlagValue(StringRef Name) const override {
return Name == "WWM_REG" ? AMDGPU::VirtRegFlag::WWM_REG
Expand Down
22 changes: 22 additions & 0 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -1351,6 +1351,28 @@ constexpr bool isEntryFunctionCC(CallingConv::ID CC) {
}
}

// Shaders that are entry functions need to count input arguments even if
// they're not used (i.e. not reported by AMDGPUResourceUsageAnalysis). Other
// functions can skip including them. This is especially important for shaders
// that use the init.whole.wave intrinsic, since they sometimes have VGPR
// arguments that are only added for the purpose of preserving their inactive
// lanes and should not be included in the vgpr-count.
LLVM_READNONE
constexpr bool shouldReportUnusedFuncArgs(CallingConv::ID CC) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The name should express the reason, not the usage context. Although here I don't understand why you're going out of your way to exclude kernels. The same reasoning should apply when using preloaded arguments.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you suggest a better name? This is mostly just an implementation detail. Maybe it shouldn't be in AMDGPUBaseInfo in the first place. Should I just move it to AMDGPUAsmPrinter.cpp?

Although here I don't understand why you're going out of your way to exclude kernels. The same reasoning should apply when using preloaded arguments

Graphics and kernels handle hardware-initialized registers a bit differently. For graphics, we pass them as arguments to the IR functions; for compute, we track them in SIMachineFunctionInfo instead. We do handle the preloaded arguments in the same place in AMDGPUAsmPrinter, just on the else branch of the check where this helper is used.

switch (CC) {
case CallingConv::AMDGPU_VS:
case CallingConv::AMDGPU_LS:
case CallingConv::AMDGPU_HS:
case CallingConv::AMDGPU_ES:
case CallingConv::AMDGPU_GS:
case CallingConv::AMDGPU_PS:
case CallingConv::AMDGPU_CS:
return true;
default:
return false;
}
}

LLVM_READNONE
constexpr bool isChainCC(CallingConv::ID CC) {
switch (CC) {
Expand Down
Loading
Loading