|
14 | 14 | #include "GCNSubtarget.h"
|
15 | 15 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
|
16 | 16 | #include "SIMachineFunctionInfo.h"
|
| 17 | +#include "llvm/ADT/PostOrderIterator.h" |
17 | 18 | #include "llvm/CodeGen/MachineFrameInfo.h"
|
18 | 19 | #include "llvm/CodeGen/MachineFunction.h"
|
19 | 20 | #include "llvm/CodeGen/ScheduleDAG.h"
|
@@ -44,22 +45,22 @@ static cl::opt<unsigned, false, MFMAPaddingRatioParser>
|
44 | 45 | cl::desc("Fill a percentage of the latency between "
|
45 | 46 | "neighboring MFMA with s_nops."));
|
46 | 47 |
|
| 48 | +static cl::opt<unsigned> MaxExhaustiveHazardSearch( |
| 49 | + "amdgpu-max-exhaustive-hazard-search", cl::init(128), cl::Hidden, |
| 50 | + cl::desc("Maximum function size for exhausive hazard search")); |
| 51 | + |
47 | 52 | //===----------------------------------------------------------------------===//
|
48 | 53 | // Hazard Recognizer Implementation
|
49 | 54 | //===----------------------------------------------------------------------===//
|
50 | 55 |
|
51 | 56 | static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
|
52 | 57 | const GCNSubtarget &ST);
|
53 | 58 |
|
54 |
| -GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) : |
55 |
| - IsHazardRecognizerMode(false), |
56 |
| - CurrCycleInstr(nullptr), |
57 |
| - MF(MF), |
58 |
| - ST(MF.getSubtarget<GCNSubtarget>()), |
59 |
| - TII(*ST.getInstrInfo()), |
60 |
| - TRI(TII.getRegisterInfo()), |
61 |
| - ClauseUses(TRI.getNumRegUnits()), |
62 |
| - ClauseDefs(TRI.getNumRegUnits()) { |
| 59 | +GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) |
| 60 | + : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF), |
| 61 | + ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()), |
| 62 | + TRI(TII.getRegisterInfo()), UseVALUReadHazardExhaustiveSearch(false), |
| 63 | + ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) { |
63 | 64 | MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
|
64 | 65 | TSchedModel.init(&ST);
|
65 | 66 | RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
|
@@ -1204,6 +1205,7 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
|
1204 | 1205 | fixWMMAHazards(MI);
|
1205 | 1206 | fixShift64HighRegBug(MI);
|
1206 | 1207 | fixVALUMaskWriteHazard(MI);
|
| 1208 | + fixVALUReadSGPRHazard(MI); |
1207 | 1209 | fixRequiredExportPriority(MI);
|
1208 | 1210 | }
|
1209 | 1211 |
|
@@ -3092,6 +3094,274 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
|
3092 | 3094 | return true;
|
3093 | 3095 | }
|
3094 | 3096 |
|
| 3097 | +// Return the numeric ID 0-63 of an 64b SGPR pair for a given SGPR. |
| 3098 | +// i.e. SGPR0 = SGPR0_SGPR1 = 0, SGPR3 = SGPR2_SGPR3 = 1, etc |
| 3099 | +static std::optional<unsigned> sgprPairNumber(Register Reg, |
| 3100 | + const SIRegisterInfo &TRI) { |
| 3101 | + switch (Reg) { |
| 3102 | + case AMDGPU::M0: |
| 3103 | + case AMDGPU::EXEC: |
| 3104 | + case AMDGPU::EXEC_LO: |
| 3105 | + case AMDGPU::EXEC_HI: |
| 3106 | + case AMDGPU::SGPR_NULL: |
| 3107 | + case AMDGPU::SGPR_NULL64: |
| 3108 | + return {}; |
| 3109 | + default: |
| 3110 | + break; |
| 3111 | + } |
| 3112 | + unsigned RegN = TRI.getEncodingValue(Reg); |
| 3113 | + if (RegN > 127) |
| 3114 | + return {}; |
| 3115 | + return (RegN >> 1) & 0x3f; |
| 3116 | +} |
| 3117 | + |
| 3118 | +// For VALUReadSGPRHazard: pre-compute a bit vector of all SGPRs used by VALUs. |
| 3119 | +void GCNHazardRecognizer::computeVALUHazardSGPRs(MachineFunction *MMF) { |
| 3120 | + assert(MMF == &MF); |
| 3121 | + |
| 3122 | + // Assume non-empty vector means it has already been computed. |
| 3123 | + if (!VALUReadHazardSGPRs.empty()) |
| 3124 | + return; |
| 3125 | + |
| 3126 | + auto CallingConv = MF.getFunction().getCallingConv(); |
| 3127 | + bool IsCallFree = |
| 3128 | + AMDGPU::isEntryFunctionCC(CallingConv) && !MF.getFrameInfo().hasCalls(); |
| 3129 | + |
| 3130 | + // Exhaustive search is only viable in non-caller/callee functions where |
| 3131 | + // VALUs will be exposed to the hazard recognizer. |
| 3132 | + UseVALUReadHazardExhaustiveSearch = |
| 3133 | + IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None && |
| 3134 | + MF.getInstructionCount() <= MaxExhaustiveHazardSearch; |
| 3135 | + |
| 3136 | + // Consider all SGPRs hazards if the shader uses function calls or is callee. |
| 3137 | + bool UseVALUUseCache = |
| 3138 | + IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None; |
| 3139 | + VALUReadHazardSGPRs.resize(64, !UseVALUUseCache); |
| 3140 | + if (!UseVALUUseCache) |
| 3141 | + return; |
| 3142 | + |
| 3143 | + // Perform a post ordered reverse scan to find VALUs which read an SGPR |
| 3144 | + // before a SALU write to the same SGPR. This provides a reduction in |
| 3145 | + // hazard insertion when all VALU access to an SGPR occurs after its last |
| 3146 | + // SALU write, when compared to a linear scan. |
| 3147 | + const MachineRegisterInfo &MRI = MF.getRegInfo(); |
| 3148 | + BitVector SALUWriteSGPRs(64), ReadSGPRs(64); |
| 3149 | + MachineCycleInfo CI; |
| 3150 | + CI.compute(*MMF); |
| 3151 | + |
| 3152 | + for (auto *MBB : post_order(&MF)) { |
| 3153 | + bool InCycle = CI.getCycle(MBB) != nullptr; |
| 3154 | + for (auto &MI : reverse(MBB->instrs())) { |
| 3155 | + bool IsVALU = SIInstrInfo::isVALU(MI); |
| 3156 | + bool IsSALU = SIInstrInfo::isSALU(MI); |
| 3157 | + if (!IsVALU && !IsSALU) |
| 3158 | + continue; |
| 3159 | + |
| 3160 | + for (const MachineOperand &Op : MI.operands()) { |
| 3161 | + if (!Op.isReg()) |
| 3162 | + continue; |
| 3163 | + Register Reg = Op.getReg(); |
| 3164 | + assert(!Op.getSubReg()); |
| 3165 | + // Only consider implicit operands of VCC. |
| 3166 | + if (Op.isImplicit() && !(Reg == AMDGPU::VCC_LO || |
| 3167 | + Reg == AMDGPU::VCC_HI || Reg == AMDGPU::VCC)) |
| 3168 | + continue; |
| 3169 | + if (!TRI.isSGPRReg(MRI, Reg)) |
| 3170 | + continue; |
| 3171 | + auto RegN = sgprPairNumber(Reg, TRI); |
| 3172 | + if (!RegN) |
| 3173 | + continue; |
| 3174 | + if (IsVALU && Op.isUse()) { |
| 3175 | + // Note: any access within a cycle must be considered a hazard. |
| 3176 | + if (InCycle || (ReadSGPRs[*RegN] && SALUWriteSGPRs[*RegN])) |
| 3177 | + VALUReadHazardSGPRs.set(*RegN); |
| 3178 | + ReadSGPRs.set(*RegN); |
| 3179 | + } else if (IsSALU) { |
| 3180 | + if (Op.isDef()) |
| 3181 | + SALUWriteSGPRs.set(*RegN); |
| 3182 | + else |
| 3183 | + ReadSGPRs.set(*RegN); |
| 3184 | + } |
| 3185 | + } |
| 3186 | + } |
| 3187 | + } |
| 3188 | +} |
| 3189 | + |
| 3190 | +bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) { |
| 3191 | + if (!ST.hasVALUReadSGPRHazard()) |
| 3192 | + return false; |
| 3193 | + |
| 3194 | + // The hazard sequence is fundamentally three instructions: |
| 3195 | + // 1. VALU reads SGPR |
| 3196 | + // 2. SALU writes SGPR |
| 3197 | + // 3. VALU/SALU reads SGPR |
| 3198 | + // Try to avoid searching for (1) because the expiry point of the hazard is |
| 3199 | + // indeterminate; however, the hazard between (2) and (3) can expire if the |
| 3200 | + // gap contains sufficient SALU instructions with no usage of SGPR from (1). |
| 3201 | + // Note: SGPRs must be considered as 64-bit pairs as hazard exists |
| 3202 | + // even if individual SGPRs are accessed. |
| 3203 | + |
| 3204 | + bool MIIsSALU = SIInstrInfo::isSALU(*MI); |
| 3205 | + bool MIIsVALU = SIInstrInfo::isVALU(*MI); |
| 3206 | + if (!(MIIsSALU || MIIsVALU)) |
| 3207 | + return false; |
| 3208 | + |
| 3209 | + // Avoid expensive search when compile time is priority by |
| 3210 | + // mitigating every SALU which writes an SGPR. |
| 3211 | + if (MF.getTarget().getOptLevel() == CodeGenOptLevel::None) { |
| 3212 | + if (!SIInstrInfo::isSALU(*MI) || SIInstrInfo::isSOPP(*MI)) |
| 3213 | + return false; |
| 3214 | + |
| 3215 | + const MachineOperand *SDSTOp = |
| 3216 | + TII.getNamedOperand(*MI, AMDGPU::OpName::sdst); |
| 3217 | + if (!SDSTOp || !SDSTOp->isReg()) |
| 3218 | + return false; |
| 3219 | + |
| 3220 | + const Register HazardReg = SDSTOp->getReg(); |
| 3221 | + if (HazardReg == AMDGPU::EXEC || HazardReg == AMDGPU::EXEC_LO || |
| 3222 | + HazardReg == AMDGPU::EXEC_HI || HazardReg == AMDGPU::M0) |
| 3223 | + return false; |
| 3224 | + |
| 3225 | + // Add s_wait_alu sa_sdst(0) after SALU write. |
| 3226 | + auto NextMI = std::next(MI->getIterator()); |
| 3227 | + auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(), |
| 3228 | + TII.get(AMDGPU::S_WAITCNT_DEPCTR)) |
| 3229 | + .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); |
| 3230 | + |
| 3231 | + // SALU write may be s_getpc in a bundle. |
| 3232 | + updateGetPCBundle(NewMI); |
| 3233 | + |
| 3234 | + return true; |
| 3235 | + } |
| 3236 | + |
| 3237 | + // Pre-compute set of SGPR pairs read by VALUs. |
| 3238 | + // Note: pass mutable pointer to MachineFunction for CycleInfo. |
| 3239 | + computeVALUHazardSGPRs(MI->getMF()); |
| 3240 | + |
| 3241 | + // If no VALUs hazard SGPRs exist then nothing to do. |
| 3242 | + if (VALUReadHazardSGPRs.none()) |
| 3243 | + return false; |
| 3244 | + |
| 3245 | + // All SGPR writes before a call/return must be flushed as the callee/caller |
| 3246 | + // will not will not see the hazard chain, i.e. (2) to (3) described above. |
| 3247 | + const bool IsSetPC = (MI->isCall() || MI->isReturn()) && |
| 3248 | + !(MI->getOpcode() == AMDGPU::S_ENDPGM || |
| 3249 | + MI->getOpcode() == AMDGPU::S_ENDPGM_SAVED); |
| 3250 | + |
| 3251 | + // Collect all SGPR sources for MI which are read by a VALU. |
| 3252 | + const MachineRegisterInfo &MRI = MF.getRegInfo(); |
| 3253 | + SmallSet<Register, 4> SGPRsUsed; |
| 3254 | + |
| 3255 | + if (!IsSetPC) { |
| 3256 | + for (const MachineOperand &Op : MI->all_uses()) { |
| 3257 | + Register OpReg = Op.getReg(); |
| 3258 | + |
| 3259 | + // Only consider VCC implicit uses on VALUs. |
| 3260 | + // The only expected SALU implicit access is SCC which is no hazard. |
| 3261 | + if (MIIsSALU && Op.isImplicit()) |
| 3262 | + continue; |
| 3263 | + |
| 3264 | + if (!TRI.isSGPRReg(MRI, OpReg)) |
| 3265 | + continue; |
| 3266 | + |
| 3267 | + auto RegN = sgprPairNumber(OpReg, TRI); |
| 3268 | + if (!RegN) |
| 3269 | + continue; |
| 3270 | + |
| 3271 | + if (!VALUReadHazardSGPRs[*RegN]) |
| 3272 | + continue; |
| 3273 | + |
| 3274 | + SGPRsUsed.insert(OpReg); |
| 3275 | + } |
| 3276 | + |
| 3277 | + // No SGPRs -> nothing to do. |
| 3278 | + if (SGPRsUsed.empty()) |
| 3279 | + return false; |
| 3280 | + } |
| 3281 | + |
| 3282 | + // A hazard is any SALU which writes one of the SGPRs read by MI. |
| 3283 | + auto IsHazardFn = [this, IsSetPC, &SGPRsUsed](const MachineInstr &I) { |
| 3284 | + if (!SIInstrInfo::isSALU(I)) |
| 3285 | + return false; |
| 3286 | + // Ensure SGPR flush before call/return by conservatively assuming every |
| 3287 | + // SALU writes an SGPR. |
| 3288 | + if (IsSetPC && I.getNumDefs() > 0) |
| 3289 | + return true; |
| 3290 | + // Check for any register writes. |
| 3291 | + return any_of(SGPRsUsed, [this, &I](Register Reg) { |
| 3292 | + return I.modifiesRegister(Reg, &TRI); |
| 3293 | + }); |
| 3294 | + }; |
| 3295 | + |
| 3296 | + const int SALUExpiryCount = SIInstrInfo::isSALU(*MI) ? 10 : 11; |
| 3297 | + auto IsExpiredFn = [&](const MachineInstr &I, int Count) { |
| 3298 | + if (Count >= SALUExpiryCount) |
| 3299 | + return true; |
| 3300 | + // s_wait_alu sa_sdst(0) on path mitigates hazard. |
| 3301 | + if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && |
| 3302 | + AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0) |
| 3303 | + return true; |
| 3304 | + return false; |
| 3305 | + }; |
| 3306 | + |
| 3307 | + auto WaitStatesFn = [this, &SGPRsUsed](const MachineInstr &I) { |
| 3308 | + // Only count true SALUs as wait states. |
| 3309 | + if (!SIInstrInfo::isSALU(I) || SIInstrInfo::isSOPP(I)) |
| 3310 | + return 0; |
| 3311 | + // SALU must be unrelated to any hazard registers. |
| 3312 | + if (any_of(SGPRsUsed, |
| 3313 | + [this, &I](Register Reg) { return I.readsRegister(Reg, &TRI); })) |
| 3314 | + return 0; |
| 3315 | + return 1; |
| 3316 | + }; |
| 3317 | + |
| 3318 | + // Check for the hazard. |
| 3319 | + DenseSet<const MachineBasicBlock *> Visited; |
| 3320 | + int WaitStates = ::getWaitStatesSince(IsHazardFn, MI->getParent(), |
| 3321 | + std::next(MI->getReverseIterator()), 0, |
| 3322 | + IsExpiredFn, Visited, WaitStatesFn); |
| 3323 | + |
| 3324 | + if (WaitStates >= SALUExpiryCount) |
| 3325 | + return false; |
| 3326 | + |
| 3327 | + // Validate hazard through an exhaustive search. |
| 3328 | + if (UseVALUReadHazardExhaustiveSearch) { |
| 3329 | + // A hazard is any VALU which reads one of the paired SGPRs read by MI. |
| 3330 | + // This is searching for (1) in the hazard description. |
| 3331 | + auto hazardPair = [this](Register Reg) { |
| 3332 | + if (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI) |
| 3333 | + return Register(AMDGPU::VCC); |
| 3334 | + auto RegN = sgprPairNumber(Reg, TRI); |
| 3335 | + return Register(AMDGPU::SGPR0_SGPR1 + *RegN); |
| 3336 | + }; |
| 3337 | + auto SearchHazardFn = [this, hazardPair, |
| 3338 | + &SGPRsUsed](const MachineInstr &I) { |
| 3339 | + if (!SIInstrInfo::isVALU(I)) |
| 3340 | + return false; |
| 3341 | + // Check for any register reads. |
| 3342 | + return any_of(SGPRsUsed, [this, hazardPair, &I](Register Reg) { |
| 3343 | + return I.readsRegister(hazardPair(Reg), &TRI); |
| 3344 | + }); |
| 3345 | + }; |
| 3346 | + auto SearchExpiredFn = [&](const MachineInstr &I, int Count) { |
| 3347 | + return false; |
| 3348 | + }; |
| 3349 | + if (::getWaitStatesSince(SearchHazardFn, MI, SearchExpiredFn) == |
| 3350 | + std::numeric_limits<int>::max()) |
| 3351 | + return false; |
| 3352 | + } |
| 3353 | + |
| 3354 | + // Add s_wait_alu sa_sdst(0) before SALU read. |
| 3355 | + auto NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), |
| 3356 | + TII.get(AMDGPU::S_WAITCNT_DEPCTR)) |
| 3357 | + .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); |
| 3358 | + |
| 3359 | + // SALU read may be after s_getpc in a bundle. |
| 3360 | + updateGetPCBundle(NewMI); |
| 3361 | + |
| 3362 | + return true; |
| 3363 | +} |
| 3364 | + |
3095 | 3365 | static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
|
3096 | 3366 | const SIInstrInfo &TII) {
|
3097 | 3367 | MachineBasicBlock &EntryMBB = MF->front();
|
|
0 commit comments