Commit eec828f

[AMDGPU] Mitigate GFX12 VALU read SGPR hazard
Any SGPR read by a VALU can potentially obscure SALU writes to the same register. Insert s_wait_alu instructions to mitigate the hazard on affected paths. Compute a global cache of SGPRs with any VALU reads and use this to avoid inserting mitigations for SGPRs never accessed by VALUs. To avoid excessive search when compile time is a priority, implement a secondary mode in which every SALU write is mitigated.
1 parent 4ac42af commit eec828f
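The hazard the commit message describes is a three-step sequence: (1) a VALU reads an SGPR, (2) a SALU writes the same SGPR, and (3) a later VALU or SALU reads that SGPR and may observe a stale value unless an s_wait_alu sa_sdst(0) intervenes. As a reading aid only, here is a toy, self-contained C++ model of where that sequence forces a wait for a single watched SGPR pair; the Op struct and findWaitPoints are invented for illustration and exist nowhere in LLVM:

#include <cstddef>
#include <vector>

// Toy op: models only the fields the hazard cares about, for ONE SGPR pair.
struct Op {
  bool IsVALU;     // false = SALU
  bool ReadsPair;  // reads the watched SGPR pair
  bool WritesPair; // writes the watched SGPR pair (relevant for SALUs)
};

// Return indices of ops that need a preceding s_wait_alu sa_sdst(0):
// (1) a VALU read, then (2) a SALU write, then (3) any read of the pair.
std::vector<std::size_t> findWaitPoints(const std::vector<Op> &Ops) {
  std::vector<std::size_t> Waits;
  bool VALURead = false;  // saw step (1)
  bool SALUWrite = false; // saw step (2) after step (1)
  for (std::size_t I = 0; I < Ops.size(); ++I) {
    const Op &O = Ops[I];
    if (VALURead && SALUWrite && O.ReadsPair) {
      Waits.push_back(I);           // step (3): this reader must wait
      VALURead = SALUWrite = false; // the wait flushes the hazard
    }
    if (O.IsVALU && O.ReadsPair)
      VALURead = true;
    else if (!O.IsVALU && VALURead && O.WritesPair)
      SALUWrite = true;
  }
  return Waits;
}

int main() {
  // VALU read, SALU write, VALU read -> a wait is needed before index 2.
  std::vector<Op> Ops = {{true, true, false},
                         {false, false, true},
                         {true, true, false}};
  return findWaitPoints(Ops).size() == 1 ? 0 : 1;
}

The real pass is more nuanced than this sketch: the hazard between (2) and (3) can expire after enough unrelated SALU instructions, all 64 SGPR pairs are tracked at once via a cached BitVector, control flow is handled by the hazard-recognizer search helpers, and at -O0 the search is skipped entirely in favor of a wait after every SALU SGPR write.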

File tree: 91 files changed (+5948 / -932 lines)

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp

Lines changed: 299 additions & 20 deletions
@@ -14,6 +14,7 @@
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
@@ -44,22 +45,22 @@ static cl::opt<unsigned, false, MFMAPaddingRatioParser>
     cl::desc("Fill a percentage of the latency between "
              "neighboring MFMA with s_nops."));

+static cl::opt<unsigned> MaxExhaustiveHazardSearch(
+    "amdgpu-max-exhaustive-hazard-search", cl::init(128), cl::Hidden,
+    cl::desc("Maximum function size for exhaustive hazard search"));
+
 //===----------------------------------------------------------------------===//
 // Hazard Recognizer Implementation
 //===----------------------------------------------------------------------===//

 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
                                                  const GCNSubtarget &ST);

-GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
-  IsHazardRecognizerMode(false),
-  CurrCycleInstr(nullptr),
-  MF(MF),
-  ST(MF.getSubtarget<GCNSubtarget>()),
-  TII(*ST.getInstrInfo()),
-  TRI(TII.getRegisterInfo()),
-  ClauseUses(TRI.getNumRegUnits()),
-  ClauseDefs(TRI.getNumRegUnits()) {
+GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
+    : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
+      ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
+      TRI(TII.getRegisterInfo()), UseVALUReadHazardExhaustiveSearch(false),
+      ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
   MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
   TSchedModel.init(&ST);
   RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
@@ -1105,6 +1106,7 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
   fixWMMAHazards(MI);
   fixShift64HighRegBug(MI);
   fixVALUMaskWriteHazard(MI);
+  fixVALUReadSGPRHazard(MI);
   fixRequiredExportPriority(MI);
 }

@@ -2761,6 +2763,36 @@ bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
   return false;
 }

+// Adjust global offsets for instructions bundled with S_GETPC_B64 after
+// insertion of a new instruction.
+static void updateGetPCBundle(MachineInstr *NewMI) {
+  if (!NewMI->isBundled())
+    return;
+
+  // Find start of bundle.
+  auto I = NewMI->getIterator();
+  while (I->isBundledWithPred())
+    I--;
+  if (I->isBundle())
+    I++;
+
+  // Bail if this is not an S_GETPC bundle.
+  if (I->getOpcode() != AMDGPU::S_GETPC_B64)
+    return;
+
+  // Update offsets of any references in the bundle.
+  const unsigned NewBytes = NewMI->getDesc().getSize();
+  auto NextMI = std::next(NewMI->getIterator());
+  auto End = NewMI->getParent()->end();
+  while (NextMI != End && NextMI->isBundledWithPred()) {
+    for (auto &Operand : NextMI->operands()) {
+      if (Operand.isGlobal())
+        Operand.setOffset(Operand.getOffset() + NewBytes);
+    }
+    NextMI++;
+  }
+}
+
 bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
   if (!ST.hasVALUMaskWriteHazard())
     return false;
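A note on updateGetPCBundle, added above: it generalizes the hard-coded "+ 4" offset bump that fixVALUMaskWriteHazard used previously (removed in the next hunk) to NewMI->getDesc().getSize(), the byte size of whatever instruction was inserted into the bundle. The reason offsets must move at all: an S_GETPC_B64 bundle materializes a PC-relative address, in the spirit of the following sketch (the operands and rel32 constants are illustrative, not taken from this commit):

  s_getpc_b64  s[0:1]                   ; capture the program counter
  s_add_u32    s0, s0, sym@rel32@lo+4   ; offsets are measured relative to
  s_addc_u32   s1, s1, sym@rel32@hi+12  ; the captured PC

Inserting a 4-byte s_waitcnt_depctr just after the s_getpc_b64 pushes every later instruction in the bundle 4 bytes further from the captured PC, so each global operand's offset must grow by the inserted size, which is exactly what Operand.setOffset(Operand.getOffset() + NewBytes) does.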
@@ -2878,22 +2910,269 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
   auto NextMI = std::next(MI->getIterator());

   // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
-  BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
-          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
-      .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
+  auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
+                       TII.get(AMDGPU::S_WAITCNT_DEPCTR))
+                   .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));

   // SALU write may be s_getpc in a bundle.
-  if (MI->getOpcode() == AMDGPU::S_GETPC_B64) {
-    // Update offsets of any references in the bundle.
-    while (NextMI != MI->getParent()->end() &&
-           NextMI->isBundledWithPred()) {
-      for (auto &Operand : NextMI->operands()) {
-        if (Operand.isGlobal())
-          Operand.setOffset(Operand.getOffset() + 4);
+  updateGetPCBundle(NewMI);
+
+  return true;
+}
+
+static unsigned baseSGPRNumber(Register Reg, const SIRegisterInfo &TRI) {
+  unsigned RegN = TRI.getEncodingValue(Reg);
+  assert(RegN <= 127);
+  return (RegN >> 1) & 0x3f;
+}
+
+// For VALUReadSGPRHazard: pre-compute a bit vector of all SGPRs used by VALUs.
+void GCNHazardRecognizer::computeVALUHazardSGPRs(MachineFunction *MMF) {
+  assert(MMF == &MF);
+
+  // Assume a non-empty vector means it has already been computed.
+  if (!VALUReadHazardSGPRs.empty())
+    return;
+
+  auto CallingConv = MF.getFunction().getCallingConv();
+  bool IsCallFree =
+      AMDGPU::isEntryFunctionCC(CallingConv) && !MF.getFrameInfo().hasCalls();
+
+  // Exhaustive search is only viable in non-caller/callee functions where
+  // VALUs will be exposed to the hazard recognizer.
+  UseVALUReadHazardExhaustiveSearch =
+      IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None &&
+      MF.getInstructionCount() <= MaxExhaustiveHazardSearch;
+
+  // Consider all SGPRs as hazards if the shader uses function calls or is a
+  // callee.
+  bool UseVALUUseCache =
+      IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None;
+  VALUReadHazardSGPRs.resize(64, !UseVALUUseCache);
+  if (!UseVALUUseCache)
+    return;
+
+  // Perform a post-ordered reverse scan to find VALUs which read an SGPR
+  // before a SALU write to the same SGPR. This provides a reduction in
+  // hazard insertion when all VALU access to an SGPR occurs after its last
+  // SALU write, when compared to a linear scan.
+  const unsigned SGPR_NULL = TRI.getEncodingValue(AMDGPU::SGPR_NULL_gfx11plus);
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  BitVector SALUWriteSGPRs(64), ReadSGPRs(64);
+  MachineCycleInfo CI;
+  CI.compute(*MMF);
+
+  for (auto *MBB : post_order(&MF)) {
+    bool InCycle = CI.getCycle(MBB) != nullptr;
+    for (auto &MI : reverse(MBB->instrs())) {
+      bool IsVALU = SIInstrInfo::isVALU(MI);
+      bool IsSALU = SIInstrInfo::isSALU(MI);
+      if (!(IsVALU || IsSALU))
+        continue;
+
+      for (const MachineOperand &Op : MI.operands()) {
+        if (!Op.isReg())
+          continue;
+        Register Reg = Op.getReg();
+        // Only consider implicit operands of VCC.
+        if (Op.isImplicit() && !(Reg == AMDGPU::VCC_LO ||
+                                 Reg == AMDGPU::VCC_HI || Reg == AMDGPU::VCC))
+          continue;
+        if (!TRI.isSGPRReg(MRI, Reg))
+          continue;
+        if (TRI.getEncodingValue(Reg) >= SGPR_NULL)
+          continue;
+        unsigned RegN = baseSGPRNumber(Reg, TRI);
+        if (IsVALU && Op.isUse()) {
+          // Note: any access within a cycle must be considered a hazard.
+          if (InCycle || (ReadSGPRs[RegN] && SALUWriteSGPRs[RegN]))
+            VALUReadHazardSGPRs.set(RegN);
+          ReadSGPRs.set(RegN);
+        } else if (IsSALU) {
+          if (Op.isDef())
+            SALUWriteSGPRs.set(RegN);
+          else
+            ReadSGPRs.set(RegN);
+        }
       }
-      NextMI++;
     }
   }
+}
+
+bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) {
+  if (!ST.hasVALUReadSGPRHazard())
+    return false;
+
+  // The hazard sequence is fundamentally three instructions:
+  //   1. VALU reads SGPR
+  //   2. SALU writes SGPR
+  //   3. VALU/SALU reads SGPR
+  // Try to avoid searching for (1) because the expiry point of the hazard is
+  // indeterminate; however, the hazard between (2) and (3) can expire if the
+  // gap contains sufficient SALU instructions with no usage of the SGPR from
+  // (1). Note: SGPRs must be considered as 64-bit pairs, as the hazard exists
+  // even if only individual SGPRs are accessed.
+
+  bool MIIsSALU = SIInstrInfo::isSALU(*MI);
+  bool MIIsVALU = SIInstrInfo::isVALU(*MI);
+  if (!(MIIsSALU || MIIsVALU))
+    return false;
+
+  // Avoid an expensive search when compile time is a priority by mitigating
+  // every SALU which writes an SGPR.
+  if (MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {
+    if (!SIInstrInfo::isSALU(*MI) || SIInstrInfo::isSOPP(*MI))
+      return false;
+
+    const MachineOperand *SDSTOp =
+        TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
+    if (!SDSTOp || !SDSTOp->isReg())
+      return false;
+
+    const Register HazardReg = SDSTOp->getReg();
+    if (HazardReg == AMDGPU::EXEC || HazardReg == AMDGPU::EXEC_LO ||
+        HazardReg == AMDGPU::EXEC_HI || HazardReg == AMDGPU::M0)
+      return false;
+
+    // Add s_wait_alu sa_sdst(0) after SALU write.
+    auto NextMI = std::next(MI->getIterator());
+    auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
+                         TII.get(AMDGPU::S_WAITCNT_DEPCTR))
+                     .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
+
+    // SALU write may be s_getpc in a bundle.
+    updateGetPCBundle(NewMI);
+
+    return true;
+  }
+
+  // Pre-compute the set of SGPR pairs read by VALUs.
+  // Note: pass a mutable pointer to the MachineFunction for CycleInfo.
+  computeVALUHazardSGPRs(MI->getMF());
+
+  // If no VALU hazard SGPRs exist then there is nothing to do.
+  if (VALUReadHazardSGPRs.none())
+    return false;
+
+  // All SGPR writes before a call/return must be flushed as the callee/caller
+  // will not see the hazard chain, i.e. (2) to (3) described above.
+  const bool IsSetPC = (MI->getOpcode() == AMDGPU::S_SETPC_B64 ||
+                        MI->getOpcode() == AMDGPU::S_SETPC_B64_return ||
+                        MI->getOpcode() == AMDGPU::S_SWAPPC_B64 ||
+                        MI->getOpcode() == AMDGPU::S_CALL_B64);
+
+  // Collect all SGPR sources for MI which are read by a VALU.
+  const unsigned SGPR_NULL = TRI.getEncodingValue(AMDGPU::SGPR_NULL_gfx11plus);
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  SmallSet<Register, 4> SGPRsUsed;
+
+  if (!IsSetPC) {
+    for (const MachineOperand &Op : MI->all_uses()) {
+      Register OpReg = Op.getReg();
+
+      // Only consider VCC implicit uses on VALUs.
+      // The only expected SALU implicit access is SCC, which is not a hazard.
+      if (MIIsSALU && Op.isImplicit())
+        continue;
+
+      if (!TRI.isSGPRReg(MRI, OpReg))
+        continue;
+
+      // Ignore special-purpose registers such as NULL, EXEC, and M0.
+      if (TRI.getEncodingValue(OpReg) >= SGPR_NULL)
+        continue;
+
+      unsigned RegN = baseSGPRNumber(OpReg, TRI);
+      if (!VALUReadHazardSGPRs[RegN])
+        continue;
+
+      SGPRsUsed.insert(OpReg);
+    }
+
+    // No SGPRs -> nothing to do.
+    if (SGPRsUsed.empty())
+      return false;
+  }
+
+  // A hazard is any SALU which writes one of the SGPRs read by MI.
+  auto IsHazardFn = [this, IsSetPC, &SGPRsUsed](const MachineInstr &I) {
+    if (!SIInstrInfo::isSALU(I))
+      return false;
+    // Ensure an SGPR flush before a call/return by conservatively assuming
+    // every SALU writes an SGPR.
+    if (IsSetPC && I.getNumDefs() > 0)
+      return true;
+    // Check for any register writes.
+    return llvm::any_of(SGPRsUsed, [this, &I](Register Reg) {
+      return I.modifiesRegister(Reg, &TRI);
+    });
+  };
+
+  const int SALUExpiryCount = SIInstrInfo::isSALU(*MI) ? 10 : 11;
+  auto IsExpiredFn = [&](const MachineInstr &I, int Count) {
+    if (Count >= SALUExpiryCount)
+      return true;
+    // s_wait_alu sa_sdst(0) on the path mitigates the hazard.
+    if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
+        AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
+      return true;
+    return false;
+  };
+
+  auto WaitStatesFn = [this, &SGPRsUsed](const MachineInstr &I) {
+    // Only count true SALUs as wait states.
+    if (!SIInstrInfo::isSALU(I) || SIInstrInfo::isSOPP(I))
+      return 0;
+    // The SALU must be unrelated to any hazard registers.
+    if (llvm::any_of(SGPRsUsed, [this, &I](Register Reg) {
+          return I.readsRegister(Reg, &TRI);
+        }))
+      return 0;
+    return 1;
+  };
+
+  // Check for the hazard.
+  DenseSet<const MachineBasicBlock *> Visited;
+  int WaitStates = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
+                                        std::next(MI->getReverseIterator()), 0,
+                                        IsExpiredFn, Visited, WaitStatesFn);
+
+  if (WaitStates >= SALUExpiryCount)
+    return false;
+
+  // Validate the hazard through an exhaustive search.
+  if (UseVALUReadHazardExhaustiveSearch) {
+    // A hazard is any VALU which reads one of the paired SGPRs read by MI.
+    // This is searching for (1) in the hazard description.
+    auto hazardPair = [this](Register Reg) {
+      if (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI)
+        return Register(AMDGPU::VCC);
+      // TODO: handle TTMP?
+      return Register(AMDGPU::SGPR0_SGPR1 + baseSGPRNumber(Reg, TRI));
+    };
+    auto SearchHazardFn = [this, hazardPair,
+                           &SGPRsUsed](const MachineInstr &I) {
+      if (!SIInstrInfo::isVALU(I))
+        return false;
+      // Check for any register reads.
+      return llvm::any_of(SGPRsUsed, [this, hazardPair, &I](Register Reg) {
+        return I.readsRegister(hazardPair(Reg), &TRI);
+      });
+    };
+    auto SearchExpiredFn = [&](const MachineInstr &I, int Count) {
+      return false;
+    };
+    if (::getWaitStatesSince(SearchHazardFn, MI, SearchExpiredFn) ==
+        std::numeric_limits<int>::max())
+      return false;
+  }
+
+  // Add s_wait_alu sa_sdst(0) before SALU read.
+  auto NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+                       TII.get(AMDGPU::S_WAITCNT_DEPCTR))
+                   .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
+
+  // SALU read may be after s_getpc in a bundle.
+  updateGetPCBundle(NewMI);

   return true;
 }
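A detail from the hunk above worth unpacking: hazard state is tracked per aligned 64-bit SGPR pair, since the hazard applies to the pair even when only one 32-bit half is touched, so baseSGPRNumber simply halves the register's hardware encoding. A minimal standalone sketch follows; basePair is a hypothetical stand-in for baseSGPRNumber, HwEncoding plays the role of TRI.getEncodingValue(Reg), and 106/107 are the standard AMDGPU encodings of vcc_lo/vcc_hi:

#include <cassert>

unsigned basePair(unsigned HwEncoding) {
  assert(HwEncoding <= 127);       // SGPR-class encodings only
  return (HwEncoding >> 1) & 0x3f; // drop the low bit: 64 pair slots
}

int main() {
  assert(basePair(0) == 0 && basePair(1) == 0);       // s0, s1    -> slot 0
  assert(basePair(2) == 1 && basePair(3) == 1);       // s2, s3    -> slot 1
  assert(basePair(106) == 53 && basePair(107) == 53); // vcc_lo/hi -> slot 53
  return 0;
}

This pairing is why VALUReadHazardSGPRs, SALUWriteSGPRs, and ReadSGPRs are all sized to 64 entries, and why the exhaustive search rewrites a register to AMDGPU::SGPR0_SGPR1 + baseSGPRNumber(Reg, TRI) before calling readsRegister: a read of either half of a pair lands in the same hazard slot.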

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h

Lines changed: 4 additions & 0 deletions
@@ -48,6 +48,8 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
   const SIRegisterInfo &TRI;
   TargetSchedModel TSchedModel;
   bool RunLdsBranchVmemWARHazardFixup;
+  BitVector VALUReadHazardSGPRs;
+  bool UseVALUReadHazardExhaustiveSearch;

   /// RegUnits of uses in the current soft memory clause.
   BitVector ClauseUses;
@@ -107,6 +109,8 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
   bool fixWMMAHazards(MachineInstr *MI);
   bool fixShift64HighRegBug(MachineInstr *MI);
   bool fixVALUMaskWriteHazard(MachineInstr *MI);
+  void computeVALUHazardSGPRs(MachineFunction *MMF);
+  bool fixVALUReadSGPRHazard(MachineInstr *MI);
   bool fixRequiredExportPriority(MachineInstr *MI);

   int checkMAIHazards(MachineInstr *MI);

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 2 additions & 0 deletions
@@ -1246,6 +1246,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,

   bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }

+  bool hasVALUReadSGPRHazard() const { return getGeneration() == GFX12; }
+
   /// Return if operations acting on VGPR tuples require even alignment.
   bool needsAlignedVGPRs() const { return GFX90AInsts; }
