Skip to content

Commit f014129

Browse files
shiltiankzhuravl
authored and committed
[AMDGPU] Mitigate GFX12 VALU read SGPR hazard (llvm#100067)
Any SGPR read by a VALU can potentially obscure SALU writes to the same register. Insert s_wait_alu instructions to mitigate the hazard on affected paths. Compute a global cache of SGPRs with any VALU reads and use this to avoid inserting mitigation for SGPRs never accessed by VALUs. To avoid excessive search when compile time is a priority, implement a secondary mode where all SALU writes are mitigated. (cherry picked from commit 8662714)
1 parent a51ffc0 commit f014129

File tree

92 files changed

+6105
-1013
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

92 files changed

+6105
-1013
lines changed

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp

Lines changed: 279 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "GCNSubtarget.h"
1515
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
1616
#include "SIMachineFunctionInfo.h"
17+
#include "llvm/ADT/PostOrderIterator.h"
1718
#include "llvm/CodeGen/MachineFrameInfo.h"
1819
#include "llvm/CodeGen/MachineFunction.h"
1920
#include "llvm/CodeGen/ScheduleDAG.h"
@@ -44,22 +45,22 @@ static cl::opt<unsigned, false, MFMAPaddingRatioParser>
4445
cl::desc("Fill a percentage of the latency between "
4546
"neighboring MFMA with s_nops."));
4647

48+
// Hidden option: largest function size, compared against
// MachineFunction::getInstructionCount(), for which the VALU-read-SGPR hazard
// mitigation still runs its exhaustive validation search. Defaults to 128.
static cl::opt<unsigned> MaxExhaustiveHazardSearch(
49+
"amdgpu-max-exhaustive-hazard-search", cl::init(128), cl::Hidden,
50+
cl::desc("Maximum function size for exhaustive hazard search"));
51+
4752
//===----------------------------------------------------------------------===//
4853
// Hazard Recognizer Implementation
4954
//===----------------------------------------------------------------------===//
5055

5156
static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
5257
const GCNSubtarget &ST);
5358

54-
GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
55-
IsHazardRecognizerMode(false),
56-
CurrCycleInstr(nullptr),
57-
MF(MF),
58-
ST(MF.getSubtarget<GCNSubtarget>()),
59-
TII(*ST.getInstrInfo()),
60-
TRI(TII.getRegisterInfo()),
61-
ClauseUses(TRI.getNumRegUnits()),
62-
ClauseDefs(TRI.getNumRegUnits()) {
59+
GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
60+
: IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
61+
ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
62+
TRI(TII.getRegisterInfo()), UseVALUReadHazardExhaustiveSearch(false),
63+
ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
6364
MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
6465
TSchedModel.init(&ST);
6566
RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
@@ -1204,6 +1205,7 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
12041205
fixWMMAHazards(MI);
12051206
fixShift64HighRegBug(MI);
12061207
fixVALUMaskWriteHazard(MI);
1208+
fixVALUReadSGPRHazard(MI);
12071209
fixRequiredExportPriority(MI);
12081210
}
12091211

@@ -3092,6 +3094,274 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
30923094
return true;
30933095
}
30943096

3097+
// Return the numeric ID 0-63 of a 64b SGPR pair for a given SGPR.
3098+
// i.e. SGPR0 = SGPR0_SGPR1 = 0, SGPR3 = SGPR2_SGPR3 = 1, etc.
// Returns an empty optional for M0, EXEC, EXEC_LO, EXEC_HI, SGPR_NULL and
// SGPR_NULL64, and for any register whose encoding value is above 127.
3099+
static std::optional<unsigned> sgprPairNumber(Register Reg,
3100+
const SIRegisterInfo &TRI) {
3101+
// Registers listed here never take part in pair tracking.
switch (Reg) {
3102+
case AMDGPU::M0:
3103+
case AMDGPU::EXEC:
3104+
case AMDGPU::EXEC_LO:
3105+
case AMDGPU::EXEC_HI:
3106+
case AMDGPU::SGPR_NULL:
3107+
case AMDGPU::SGPR_NULL64:
3108+
return {};
3109+
default:
3110+
break;
3111+
}
3112+
unsigned RegN = TRI.getEncodingValue(Reg);
3113+
// Encodings above 127 are outside the tracked range.
if (RegN > 127)
3114+
return {};
3115+
// Drop the low bit so both halves of an aligned pair share one ID; the mask
// keeps the result within 0-63.
return (RegN >> 1) & 0x3f;
3116+
}
3117+
3118+
// For VALUReadSGPRHazard: pre-compute a bit vector of all SGPRs used by VALUs.
// The result is stored in VALUReadHazardSGPRs, indexed by the 64-bit SGPR pair
// number from sgprPairNumber(); UseVALUReadHazardExhaustiveSearch is also
// decided here. MMF must point at the same function as the MF member; it is
// passed separately as a mutable pointer for the cycle analysis below.
3119+
void GCNHazardRecognizer::computeVALUHazardSGPRs(MachineFunction *MMF) {
3120+
assert(MMF == &MF);
3121+
3122+
// Assume non-empty vector means it has already been computed.
// (The vector is resized to 64 entries on every path below, so it is
// non-empty after the first call even when no bit ends up set.)
3123+
if (!VALUReadHazardSGPRs.empty())
3124+
return;
3125+
3126+
auto CallingConv = MF.getFunction().getCallingConv();
3127+
bool IsCallFree =
3128+
AMDGPU::isEntryFunctionCC(CallingConv) && !MF.getFrameInfo().hasCalls();
3129+
3130+
// Exhaustive search is only viable in non-caller/callee functions where
3131+
// VALUs will be exposed to the hazard recognizer.
// It is additionally limited to optimized builds and to functions whose
// instruction count does not exceed MaxExhaustiveHazardSearch.
3132+
UseVALUReadHazardExhaustiveSearch =
3133+
IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None &&
3134+
MF.getInstructionCount() <= MaxExhaustiveHazardSearch;
3135+
3136+
// Consider all SGPRs hazards if the shader uses function calls or is callee.
// In that case (or at opt level None) every one of the 64 bits is set and
// the scan below is skipped.
3137+
bool UseVALUUseCache =
3138+
IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None;
3139+
VALUReadHazardSGPRs.resize(64, !UseVALUUseCache);
3140+
if (!UseVALUUseCache)
3141+
return;
3142+
3143+
// Perform a post ordered reverse scan to find VALUs which read an SGPR
3144+
// before a SALU write to the same SGPR. This provides a reduction in
3145+
// hazard insertion when all VALU access to an SGPR occurs after its last
3146+
// SALU write, when compared to a linear scan.
3147+
const MachineRegisterInfo &MRI = MF.getRegInfo();
3148+
// SALUWriteSGPRs: pairs defined by a SALU seen so far in the scan.
// ReadSGPRs: pairs read by any VALU or SALU seen so far in the scan.
BitVector SALUWriteSGPRs(64), ReadSGPRs(64);
3149+
MachineCycleInfo CI;
3150+
CI.compute(*MMF);
3151+
3152+
for (auto *MBB : post_order(&MF)) {
3153+
bool InCycle = CI.getCycle(MBB) != nullptr;
3154+
for (auto &MI : reverse(MBB->instrs())) {
3155+
bool IsVALU = SIInstrInfo::isVALU(MI);
3156+
bool IsSALU = SIInstrInfo::isSALU(MI);
3157+
// Instructions that are neither VALU nor SALU are ignored.
if (!IsVALU && !IsSALU)
3158+
continue;
3159+
3160+
for (const MachineOperand &Op : MI.operands()) {
3161+
if (!Op.isReg())
3162+
continue;
3163+
Register Reg = Op.getReg();
3164+
assert(!Op.getSubReg());
3165+
// Only consider implicit operands of VCC.
3166+
if (Op.isImplicit() && !(Reg == AMDGPU::VCC_LO ||
3167+
Reg == AMDGPU::VCC_HI || Reg == AMDGPU::VCC))
3168+
continue;
3169+
if (!TRI.isSGPRReg(MRI, Reg))
3170+
continue;
3171+
auto RegN = sgprPairNumber(Reg, TRI);
3172+
if (!RegN)
3173+
continue;
3174+
// VALU defs match neither branch below and are therefore not recorded.
if (IsVALU && Op.isUse()) {
3175+
// Note: any access within a cycle must be considered a hazard.
// Outside a cycle the pair is flagged only if the scan has already
// seen both a read and a SALU write of it.
3176+
if (InCycle || (ReadSGPRs[*RegN] && SALUWriteSGPRs[*RegN]))
3177+
VALUReadHazardSGPRs.set(*RegN);
3178+
ReadSGPRs.set(*RegN);
3179+
} else if (IsSALU) {
3180+
if (Op.isDef())
3181+
SALUWriteSGPRs.set(*RegN);
3182+
else
3183+
ReadSGPRs.set(*RegN);
3184+
}
3185+
}
3186+
}
3187+
}
3188+
}
3189+
3190+
// Mitigate the VALU-read-SGPR hazard around MI by inserting an
// S_WAITCNT_DEPCTR whose immediate encodes sa_sdst(0).
// At opt level None the wait is placed after every qualifying SALU write;
// otherwise it is placed before MI when a hazardous SALU write is found on a
// path leading to MI. Returns true if an instruction was inserted and false
// otherwise (including when the subtarget does not have the hazard).
bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) {
3191+
if (!ST.hasVALUReadSGPRHazard())
3192+
return false;
3193+
3194+
// The hazard sequence is fundamentally three instructions:
3195+
// 1. VALU reads SGPR
3196+
// 2. SALU writes SGPR
3197+
// 3. VALU/SALU reads SGPR
3198+
// Try to avoid searching for (1) because the expiry point of the hazard is
3199+
// indeterminate; however, the hazard between (2) and (3) can expire if the
3200+
// gap contains sufficient SALU instructions with no usage of SGPR from (1).
3201+
// Note: SGPRs must be considered as 64-bit pairs as hazard exists
3202+
// even if individual SGPRs are accessed.
3203+
3204+
bool MIIsSALU = SIInstrInfo::isSALU(*MI);
3205+
bool MIIsVALU = SIInstrInfo::isVALU(*MI);
3206+
if (!(MIIsSALU || MIIsVALU))
3207+
return false;
3208+
3209+
// Avoid expensive search when compile time is priority by
3210+
// mitigating every SALU which writes an SGPR.
3211+
if (MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {
3212+
// Only non-SOPP SALU instructions are handled in this mode.
if (!SIInstrInfo::isSALU(*MI) || SIInstrInfo::isSOPP(*MI))
3213+
return false;
3214+
3215+
const MachineOperand *SDSTOp =
3216+
TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
3217+
if (!SDSTOp || !SDSTOp->isReg())
3218+
return false;
3219+
3220+
// Writes to EXEC, EXEC_LO, EXEC_HI and M0 are not mitigated.
const Register HazardReg = SDSTOp->getReg();
3221+
if (HazardReg == AMDGPU::EXEC || HazardReg == AMDGPU::EXEC_LO ||
3222+
HazardReg == AMDGPU::EXEC_HI || HazardReg == AMDGPU::M0)
3223+
return false;
3224+
3225+
// Add s_wait_alu sa_sdst(0) after SALU write.
3226+
auto NextMI = std::next(MI->getIterator());
3227+
auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
3228+
TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3229+
.addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
3230+
3231+
// SALU write may be s_getpc in a bundle.
3232+
updateGetPCBundle(NewMI);
3233+
3234+
return true;
3235+
}
3236+
3237+
// Pre-compute set of SGPR pairs read by VALUs.
3238+
// Note: pass mutable pointer to MachineFunction for CycleInfo.
3239+
computeVALUHazardSGPRs(MI->getMF());
3240+
3241+
// If no VALU hazard SGPRs exist then nothing to do.
3242+
if (VALUReadHazardSGPRs.none())
3243+
return false;
3244+
3245+
// All SGPR writes before a call/return must be flushed as the callee/caller
3246+
// will not see the hazard chain, i.e. (2) to (3) described above.
// S_ENDPGM and S_ENDPGM_SAVED are excluded from this treatment.
3247+
const bool IsSetPC = (MI->isCall() || MI->isReturn()) &&
3248+
!(MI->getOpcode() == AMDGPU::S_ENDPGM ||
3249+
MI->getOpcode() == AMDGPU::S_ENDPGM_SAVED);
3250+
3251+
// Collect all SGPR sources for MI which are read by a VALU.
3252+
const MachineRegisterInfo &MRI = MF.getRegInfo();
3253+
SmallSet<Register, 4> SGPRsUsed;
3254+
3255+
// For a call/return SGPRsUsed is left empty; IsHazardFn handles that case
// through IsSetPC instead.
if (!IsSetPC) {
3256+
for (const MachineOperand &Op : MI->all_uses()) {
3257+
Register OpReg = Op.getReg();
3258+
3259+
// Only consider VCC implicit uses on VALUs.
3260+
// The only expected SALU implicit access is SCC which is no hazard.
3261+
if (MIIsSALU && Op.isImplicit())
3262+
continue;
3263+
3264+
if (!TRI.isSGPRReg(MRI, OpReg))
3265+
continue;
3266+
3267+
auto RegN = sgprPairNumber(OpReg, TRI);
3268+
if (!RegN)
3269+
continue;
3270+
3271+
// Skip pairs that computeVALUHazardSGPRs() did not flag.
if (!VALUReadHazardSGPRs[*RegN])
3272+
continue;
3273+
3274+
SGPRsUsed.insert(OpReg);
3275+
}
3276+
3277+
// No SGPRs -> nothing to do.
3278+
if (SGPRsUsed.empty())
3279+
return false;
3280+
}
3281+
3282+
// A hazard is any SALU which writes one of the SGPRs read by MI.
3283+
auto IsHazardFn = [this, IsSetPC, &SGPRsUsed](const MachineInstr &I) {
3284+
if (!SIInstrInfo::isSALU(I))
3285+
return false;
3286+
// Ensure SGPR flush before call/return by conservatively assuming every
3287+
// SALU writes an SGPR.
3288+
if (IsSetPC && I.getNumDefs() > 0)
3289+
return true;
3290+
// Check for any register writes.
3291+
return any_of(SGPRsUsed, [this, &I](Register Reg) {
3292+
return I.modifiesRegister(Reg, &TRI);
3293+
});
3294+
};
3295+
3296+
// Expiry threshold: 10 when MI itself is a SALU, 11 otherwise.
const int SALUExpiryCount = SIInstrInfo::isSALU(*MI) ? 10 : 11;
3297+
auto IsExpiredFn = [&](const MachineInstr &I, int Count) {
3298+
if (Count >= SALUExpiryCount)
3299+
return true;
3300+
// s_wait_alu sa_sdst(0) on path mitigates hazard.
3301+
if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
3302+
AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
3303+
return true;
3304+
return false;
3305+
};
3306+
3307+
auto WaitStatesFn = [this, &SGPRsUsed](const MachineInstr &I) {
3308+
// Only count true SALUs as wait states.
3309+
if (!SIInstrInfo::isSALU(I) || SIInstrInfo::isSOPP(I))
3310+
return 0;
3311+
// SALU must be unrelated to any hazard registers.
3312+
if (any_of(SGPRsUsed,
3313+
[this, &I](Register Reg) { return I.readsRegister(Reg, &TRI); }))
3314+
return 0;
3315+
return 1;
3316+
};
3317+
3318+
// Check for the hazard.
// The search starts at the instruction preceding MI.
3319+
DenseSet<const MachineBasicBlock *> Visited;
3320+
int WaitStates = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
3321+
std::next(MI->getReverseIterator()), 0,
3322+
IsExpiredFn, Visited, WaitStatesFn);
3323+
3324+
// Enough wait states (or no hazard found) -> nothing to insert.
// NOTE(review): relies on getWaitStatesSince returning a large value when
// no hazard is found; confirm against the helper.
if (WaitStates >= SALUExpiryCount)
3325+
return false;
3326+
3327+
// Validate hazard through an exhaustive search.
3328+
if (UseVALUReadHazardExhaustiveSearch) {
3329+
// A hazard is any VALU which reads one of the paired SGPRs read by MI.
3330+
// This is searching for (1) in the hazard description.
3331+
auto hazardPair = [this](Register Reg) {
3332+
if (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI)
3333+
return Register(AMDGPU::VCC);
3334+
// RegN is dereferenced unchecked: every register in SGPRsUsed was
// inserted only after sgprPairNumber() returned a value for it.
auto RegN = sgprPairNumber(Reg, TRI);
3335+
return Register(AMDGPU::SGPR0_SGPR1 + *RegN);
3336+
};
3337+
auto SearchHazardFn = [this, hazardPair,
3338+
&SGPRsUsed](const MachineInstr &I) {
3339+
if (!SIInstrInfo::isVALU(I))
3340+
return false;
3341+
// Check for any register reads.
3342+
return any_of(SGPRsUsed, [this, hazardPair, &I](Register Reg) {
3343+
return I.readsRegister(hazardPair(Reg), &TRI);
3344+
});
3345+
};
3346+
// This search never expires on its own.
auto SearchExpiredFn = [&](const MachineInstr &I, int Count) {
3347+
return false;
3348+
};
3349+
// NOTE(review): presumably the helper returns
// std::numeric_limits<int>::max() when no matching VALU read is found;
// confirm against getWaitStatesSince.
if (::getWaitStatesSince(SearchHazardFn, MI, SearchExpiredFn) ==
3350+
std::numeric_limits<int>::max())
3351+
return false;
3352+
}
3353+
3354+
// Add s_wait_alu sa_sdst(0) before MI (the SALU/VALU read).
3355+
auto NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3356+
TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3357+
.addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
3358+
3359+
// SALU read may be after s_getpc in a bundle.
3360+
updateGetPCBundle(NewMI);
3361+
3362+
return true;
3363+
}
3364+
30953365
static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
30963366
const SIInstrInfo &TII) {
30973367
MachineBasicBlock &EntryMBB = MF->front();

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
4848
const SIRegisterInfo &TRI;
4949
TargetSchedModel TSchedModel;
5050
bool RunLdsBranchVmemWARHazardFixup;
51+
BitVector VALUReadHazardSGPRs;
52+
bool UseVALUReadHazardExhaustiveSearch;
5153

5254
/// RegUnits of uses in the current soft memory clause.
5355
BitVector ClauseUses;
@@ -107,6 +109,8 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
107109
bool fixWMMAHazards(MachineInstr *MI);
108110
bool fixShift64HighRegBug(MachineInstr *MI);
109111
bool fixVALUMaskWriteHazard(MachineInstr *MI);
112+
void computeVALUHazardSGPRs(MachineFunction *MMF);
113+
bool fixVALUReadSGPRHazard(MachineInstr *MI);
110114
bool fixRequiredExportPriority(MachineInstr *MI);
111115

112116
int checkMAIHazards(MachineInstr *MI);

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1273,6 +1273,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
12731273

12741274
bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
12751275

1276+
/// Return true if the subtarget generation is GFX12, i.e. the VALU read SGPR
/// hazard mitigation applies.
bool hasVALUReadSGPRHazard() const { return getGeneration() == GFX12; }
1277+
12761278
/// Return if operations acting on VGPR tuples require even alignment.
12771279
bool needsAlignedVGPRs() const { return GFX90AInsts; }
12781280

0 commit comments

Comments
 (0)