14 | 14 | #include "GCNSubtarget.h"
15 | 15 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
16 | 16 | #include "SIMachineFunctionInfo.h"
| 17 | +#include "llvm/ADT/PostOrderIterator.h" |
17 | 18 | #include "llvm/CodeGen/MachineFrameInfo.h"
18 | 19 | #include "llvm/CodeGen/MachineFunction.h"
19 | 20 | #include "llvm/CodeGen/ScheduleDAG.h"
@@ -44,22 +45,22 @@ static cl::opt<unsigned, false, MFMAPaddingRatioParser>
44 | 45 | cl::desc("Fill a percentage of the latency between "
45 | 46 | "neighboring MFMA with s_nops."));
46 | 47 |
| 48 | +static cl::opt<unsigned> MaxExhaustiveHazardSearch( |
| 49 | + "amdgpu-max-exhaustive-hazard-search", cl::init(128), cl::Hidden, |
| 50 | + cl::desc("Maximum function size for exhaustive hazard search")); |
| 51 | + |
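The cutoff is an ordinary hidden cl::opt, so the 128-instruction default can be tuned when experimenting with the compile-time/quality tradeoff, e.g. `llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-max-exhaustive-hazard-search=256 test.ll` (the flag name comes from the declaration above; the GFX12 target and input file are illustrative placeholders).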
47 | 52 | //===----------------------------------------------------------------------===//
48 | 53 | // Hazard Recognizer Implementation
49 | 54 | //===----------------------------------------------------------------------===//
50 | 55 |
51 | 56 | static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
52 | 57 | const GCNSubtarget &ST);
53 | 58 |
54 | | -GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) : |
55 | | -  IsHazardRecognizerMode(false), |
56 | | -  CurrCycleInstr(nullptr), |
57 | | -  MF(MF), |
58 | | -  ST(MF.getSubtarget<GCNSubtarget>()), |
59 | | -  TII(*ST.getInstrInfo()), |
60 | | -  TRI(TII.getRegisterInfo()), |
61 | | -  ClauseUses(TRI.getNumRegUnits()), |
62 | | -  ClauseDefs(TRI.getNumRegUnits()) { |
| 59 | +GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) |
| 60 | + : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF), |
| 61 | + ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()), |
| 62 | + TRI(TII.getRegisterInfo()), UseVALUReadHazardExhaustiveSearch(false), |
| 63 | + ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) { |
63 | 64 | MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
64 | 65 | TSchedModel.init(&ST);
65 | 66 | RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
@@ -1195,6 +1196,7 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
1195 | 1196 | fixWMMAHazards(MI);
1196 | 1197 | fixShift64HighRegBug(MI);
1197 | 1198 | fixVALUMaskWriteHazard(MI);
| 1199 | + fixVALUReadSGPRHazard(MI); |
1198 | 1200 | fixRequiredExportPriority(MI);
1199 | 1201 | }
1200 | 1202 |
@@ -3010,6 +3012,263 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
3010 | 3012 | return true;
3011 | 3013 | }
3012 | 3014 |
| 3015 | +static unsigned baseSGPRNumber(Register Reg, const SIRegisterInfo &TRI) { |
| 3016 | + unsigned RegN = TRI.getEncodingValue(Reg); |
| 3017 | + assert(RegN <= 127); |
| 3018 | + return (RegN >> 1) & 0x3f; |
| 3019 | +} |
| 3020 | + |
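Since the pairing trick above is easy to misread, here is a minimal standalone sketch of the same mapping (illustrative only, not the LLVM API), assuming the usual GFX11+ encodings where s0..s105 occupy 0..105 and vcc_lo/vcc_hi are 106/107: encodings 2N and 2N+1 share pair index N.

#include <cassert>

// Hypothetical mirror of baseSGPRNumber() on raw encoding values.
static unsigned baseSGPRNumberSketch(unsigned EncodingValue) {
  assert(EncodingValue <= 127 && "only low SGPR encodings expected");
  return (EncodingValue >> 1) & 0x3f; // halve into a 64-bit pair index
}
// baseSGPRNumberSketch(4) == baseSGPRNumberSketch(5) == 2  (pair s[4:5])
// baseSGPRNumberSketch(106) == 53  (vcc_lo)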
| 3021 | +// For VALUReadSGPRHazard: pre-compute a bit vector of all SGPRs used by VALUs. |
| 3022 | +void GCNHazardRecognizer::computeVALUHazardSGPRs(MachineFunction *MMF) { |
| 3023 | + assert(MMF == &MF); |
| 3024 | + |
| 3025 | + // Assume non-empty vector means it has already been computed. |
| 3026 | + if (!VALUReadHazardSGPRs.empty()) |
| 3027 | + return; |
| 3028 | + |
| 3029 | + auto CallingConv = MF.getFunction().getCallingConv(); |
| 3030 | + bool IsCallFree = |
| 3031 | + AMDGPU::isEntryFunctionCC(CallingConv) && !MF.getFrameInfo().hasCalls(); |
| 3032 | + |
| 3033 | + // Exhaustive search is only viable in non-caller/callee functions where |
| 3034 | + // VALUs will be exposed to the hazard recognizer. |
| 3035 | + UseVALUReadHazardExhaustiveSearch = |
| 3036 | + IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None && |
| 3037 | + MF.getInstructionCount() <= MaxExhaustiveHazardSearch; |
| 3038 | + |
| 3039 | + // Consider all SGPRs as hazards if the shader uses function calls or is a callee. |
| 3040 | + bool UseVALUUseCache = |
| 3041 | + IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None; |
| 3042 | + VALUReadHazardSGPRs.resize(64, !UseVALUUseCache); |
| 3043 | + if (!UseVALUUseCache) |
| 3044 | + return; |
| 3045 | + |
| 3046 | + // Perform a post-order reverse scan to find VALUs which read an SGPR |
| 3047 | + // before a SALU write to the same SGPR. Compared to a linear scan, this |
| 3048 | + // reduces hazard insertion when all VALU accesses to an SGPR occur after |
| 3049 | + // its last SALU write. |
| 3050 | + const unsigned SGPR_NULL = TRI.getEncodingValue(AMDGPU::SGPR_NULL_gfx11plus); |
| 3051 | + const MachineRegisterInfo &MRI = MF.getRegInfo(); |
| 3052 | + BitVector SALUWriteSGPRs(64), ReadSGPRs(64); |
| 3053 | + MachineCycleInfo CI; |
| 3054 | + CI.compute(*MMF); |
| 3055 | + |
| 3056 | + for (auto *MBB : post_order(&MF)) { |
| 3057 | + bool InCycle = CI.getCycle(MBB) != nullptr; |
| 3058 | + for (auto &MI : reverse(MBB->instrs())) { |
| 3059 | + bool IsVALU = SIInstrInfo::isVALU(MI); |
| 3060 | + bool IsSALU = SIInstrInfo::isSALU(MI); |
| 3061 | + if (!(IsVALU || IsSALU)) |
| 3062 | + continue; |
| 3063 | + |
| 3064 | + for (const MachineOperand &Op : MI.operands()) { |
| 3065 | + if (!Op.isReg()) |
| 3066 | + continue; |
| 3067 | + Register Reg = Op.getReg(); |
| 3068 | + // Only consider implicit operands of VCC. |
| 3069 | + if (Op.isImplicit() && !(Reg == AMDGPU::VCC_LO || |
| 3070 | + Reg == AMDGPU::VCC_HI || Reg == AMDGPU::VCC)) |
| 3071 | + continue; |
| 3072 | + if (!TRI.isSGPRReg(MRI, Reg)) |
| 3073 | + continue; |
| 3074 | + if (TRI.getEncodingValue(Reg) >= SGPR_NULL) |
| 3075 | + continue; |
| 3076 | + unsigned RegN = baseSGPRNumber(Reg, TRI); |
| 3077 | + if (IsVALU && Op.isUse()) { |
| 3078 | + // Note: any access within a cycle must be considered a hazard. |
| 3079 | + if (InCycle || (ReadSGPRs[RegN] && SALUWriteSGPRs[RegN])) |
| 3080 | + VALUReadHazardSGPRs.set(RegN); |
| 3081 | + ReadSGPRs.set(RegN); |
| 3082 | + } else if (IsSALU) { |
| 3083 | + if (Op.isDef()) |
| 3084 | + SALUWriteSGPRs.set(RegN); |
| 3085 | + else |
| 3086 | + ReadSGPRs.set(RegN); |
| 3087 | + } |
| 3088 | + } |
| 3089 | + } |
| 3090 | + } |
| 3091 | +} |
| 3092 | + |
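The scan direction is the subtle part: blocks are visited in post order and instructions in reverse, so the ReadSGPRs/SALUWriteSGPRs bits describe what happens after the current instruction in program order, and a VALU read is flagged exactly when a later SALU write is itself followed by a later read. A self-contained toy model of that bookkeeping (hypothetical types, not the LLVM API):

#include <bitset>
#include <cstdio>
#include <vector>

struct ToyInst { bool IsVALU; bool IsDef; unsigned SGPRPair; };

int main() {
  // Program order: VALU reads pair 0; SALU writes pair 0; SALU reads pair 0.
  std::vector<ToyInst> Prog = {
      {true, false, 0},  // (1) VALU reads the pair
      {false, true, 0},  // (2) SALU writes the pair
      {false, false, 0}, // (3) SALU reads the pair
  };
  std::bitset<64> SALUWrite, Read, Hazard;
  for (auto It = Prog.rbegin(); It != Prog.rend(); ++It) {
    unsigned N = It->SGPRPair;
    if (It->IsVALU && !It->IsDef) {
      // Reached (1) with (2) and (3) already recorded -> hazard.
      if (Read[N] && SALUWrite[N])
        Hazard.set(N);
      Read.set(N);
    } else if (!It->IsVALU) {
      if (It->IsDef)
        SALUWrite.set(N);
      else
        Read.set(N);
    }
  }
  std::printf("pair 0 hazardous: %d\n", (int)Hazard.test(0)); // prints 1
}

If the VALU read instead came after the last SALU write, SALUWrite[N] would still be clear when the read is visited and no bit would be set, which is exactly the reduction over a linear scan that the comment above describes.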
| 3093 | +bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) { |
| 3094 | + if (!ST.hasVALUReadSGPRHazard()) |
| 3095 | + return false; |
| 3096 | + |
| 3097 | + // The hazard sequence is fundamentally three instructions: |
| 3098 | + // 1. VALU reads SGPR |
| 3099 | + // 2. SALU writes SGPR |
| 3100 | + // 3. VALU/SALU reads SGPR |
| 3101 | + // Try to avoid searching for (1) because the expiry point of the hazard is |
| 3102 | + // indeterminate; however, the hazard between (2) and (3) can expire if the |
| 3103 | + // gap contains sufficient SALU instructions with no usage of the SGPR |
| 3104 | + // from (1). Note: SGPRs must be considered as 64-bit pairs, as the |
| 3105 | + // hazard exists even when only individual SGPRs are accessed. |
| 3106 | + |
| 3107 | + bool MIIsSALU = SIInstrInfo::isSALU(*MI); |
| 3108 | + bool MIIsVALU = SIInstrInfo::isVALU(*MI); |
| 3109 | + if (!(MIIsSALU || MIIsVALU)) |
| 3110 | + return false; |
| 3111 | + |
| 3112 | + // Avoid the expensive search when compile time is the priority by |
| 3113 | + // mitigating every SALU which writes an SGPR. |
| 3114 | + if (MF.getTarget().getOptLevel() == CodeGenOptLevel::None) { |
| 3115 | + if (!SIInstrInfo::isSALU(*MI) || SIInstrInfo::isSOPP(*MI)) |
| 3116 | + return false; |
| 3117 | + |
| 3118 | + const MachineOperand *SDSTOp = |
| 3119 | + TII.getNamedOperand(*MI, AMDGPU::OpName::sdst); |
| 3120 | + if (!SDSTOp || !SDSTOp->isReg()) |
| 3121 | + return false; |
| 3122 | + |
| 3123 | + const Register HazardReg = SDSTOp->getReg(); |
| 3124 | + if (HazardReg == AMDGPU::EXEC || HazardReg == AMDGPU::EXEC_LO || |
| 3125 | + HazardReg == AMDGPU::EXEC_HI || HazardReg == AMDGPU::M0) |
| 3126 | + return false; |
| 3127 | + |
| 3128 | + // Add s_wait_alu sa_sdst(0) after SALU write. |
| 3129 | + auto NextMI = std::next(MI->getIterator()); |
| 3130 | + auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(), |
| 3131 | + TII.get(AMDGPU::S_WAITCNT_DEPCTR)) |
| 3132 | + .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); |
| 3133 | + |
| 3134 | + // SALU write may be s_getpc in a bundle. |
| 3135 | + updateGetPCBundle(NewMI); |
| 3136 | + |
| 3137 | + return true; |
| 3138 | + } |
| 3139 | + |
| 3140 | + // Pre-compute set of SGPR pairs read by VALUs. |
| 3141 | + // Note: pass mutable pointer to MachineFunction for CycleInfo. |
| 3142 | + computeVALUHazardSGPRs(MI->getMF()); |
| 3143 | + |
| 3144 | + // If no VALU hazard SGPRs exist then there is nothing to do. |
| 3145 | + if (VALUReadHazardSGPRs.none()) |
| 3146 | + return false; |
| 3147 | + |
| 3148 | + // All SGPR writes before a call/return must be flushed as the callee/caller |
| 3149 | + // will not see the hazard chain, i.e. (2) to (3) described above. |
| 3150 | + const bool IsSetPC = (MI->getOpcode() == AMDGPU::S_SETPC_B64 || |
| 3151 | + MI->getOpcode() == AMDGPU::S_SETPC_B64_return || |
| 3152 | + MI->getOpcode() == AMDGPU::S_SWAPPC_B64 || |
| 3153 | + MI->getOpcode() == AMDGPU::S_CALL_B64); |
| 3154 | + |
| 3155 | + // Collect all SGPR sources for MI which are read by a VALU. |
| 3156 | + const unsigned SGPR_NULL = TRI.getEncodingValue(AMDGPU::SGPR_NULL_gfx11plus); |
| 3157 | + const MachineRegisterInfo &MRI = MF.getRegInfo(); |
| 3158 | + SmallSet<Register, 4> SGPRsUsed; |
| 3159 | + |
| 3160 | + if (!IsSetPC) { |
| 3161 | + for (const MachineOperand &Op : MI->all_uses()) { |
| 3162 | + Register OpReg = Op.getReg(); |
| 3163 | + |
| 3164 | + // Only consider VCC implicit uses on VALUs. |
| 3165 | + // The only expected SALU implicit access is SCC, which is not a hazard. |
| 3166 | + if (MIIsSALU && Op.isImplicit()) |
| 3167 | + continue; |
| 3168 | + |
| 3169 | + if (!TRI.isSGPRReg(MRI, OpReg)) |
| 3170 | + continue; |
| 3171 | + |
| 3172 | + // Ignore special purpose registers such as NULL, EXEC, and M0. |
| 3173 | + if (TRI.getEncodingValue(OpReg) >= SGPR_NULL) |
| 3174 | + continue; |
| 3175 | + |
| 3176 | + unsigned RegN = baseSGPRNumber(OpReg, TRI); |
| 3177 | + if (!VALUReadHazardSGPRs[RegN]) |
| 3178 | + continue; |
| 3179 | + |
| 3180 | + SGPRsUsed.insert(OpReg); |
| 3181 | + } |
| 3182 | + |
| 3183 | + // No SGPRs -> nothing to do. |
| 3184 | + if (SGPRsUsed.empty()) |
| 3185 | + return false; |
| 3186 | + } |
| 3187 | + |
| 3188 | + // A hazard is any SALU which writes one of the SGPRs read by MI. |
| 3189 | + auto IsHazardFn = [this, IsSetPC, &SGPRsUsed](const MachineInstr &I) { |
| 3190 | + if (!SIInstrInfo::isSALU(I)) |
| 3191 | + return false; |
| 3192 | + // Ensure SGPR flush before call/return by conservatively assuming every |
| 3193 | + // SALU writes an SGPR. |
| 3194 | + if (IsSetPC && I.getNumDefs() > 0) |
| 3195 | + return true; |
| 3196 | + // Check for any register writes. |
| 3197 | + return llvm::any_of(SGPRsUsed, [this, &I](Register Reg) { |
| 3198 | + return I.modifiesRegister(Reg, &TRI); |
| 3199 | + }); |
| 3200 | + }; |
| 3201 | + |
| 3202 | + const int SALUExpiryCount = SIInstrInfo::isSALU(*MI) ? 10 : 11; |
| 3203 | + auto IsExpiredFn = [&](const MachineInstr &I, int Count) { |
| 3204 | + if (Count >= SALUExpiryCount) |
| 3205 | + return true; |
| 3206 | + // An s_wait_alu sa_sdst(0) on the path mitigates the hazard. |
| 3207 | + if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && |
| 3208 | + AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0) |
| 3209 | + return true; |
| 3210 | + return false; |
| 3211 | + }; |
| 3212 | + |
| 3213 | + auto WaitStatesFn = [this, &SGPRsUsed](const MachineInstr &I) { |
| 3214 | + // Only count true SALUs as wait states. |
| 3215 | + if (!SIInstrInfo::isSALU(I) || SIInstrInfo::isSOPP(I)) |
| 3216 | + return 0; |
| 3217 | + // SALU must be unrelated to any hazard registers. |
| 3218 | + if (llvm::any_of(SGPRsUsed, [this, &I](Register Reg) { |
| 3219 | + return I.readsRegister(Reg, &TRI); |
| 3220 | + })) |
| 3221 | + return 0; |
| 3222 | + return 1; |
| 3223 | + }; |
| 3224 | + |
| 3225 | + // Check for the hazard. |
| 3226 | + DenseSet<const MachineBasicBlock *> Visited; |
| 3227 | + int WaitStates = ::getWaitStatesSince(IsHazardFn, MI->getParent(), |
| 3228 | + std::next(MI->getReverseIterator()), 0, |
| 3229 | + IsExpiredFn, Visited, WaitStatesFn); |
| 3230 | + |
| 3231 | + if (WaitStates >= SALUExpiryCount) |
| 3232 | + return false; |
| 3233 | + |
| 3234 | + // Validate hazard through an exhaustive search. |
| 3235 | + if (UseVALUReadHazardExhaustiveSearch) { |
| 3236 | + // A hazard is any VALU which reads one of the paired SGPRs read by MI. |
| 3237 | + // This is searching for (1) in the hazard description. |
| 3238 | + auto hazardPair = [this](Register Reg) { |
| 3239 | + if (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI) |
| 3240 | + return Register(AMDGPU::VCC); |
| 3241 | + // TODO: handle TTMP? |
| 3242 | + return Register(AMDGPU::SGPR0_SGPR1 + baseSGPRNumber(Reg, TRI)); |
| 3243 | + }; |
| 3244 | + auto SearchHazardFn = [this, hazardPair, |
| 3245 | + &SGPRsUsed](const MachineInstr &I) { |
| 3246 | + if (!SIInstrInfo::isVALU(I)) |
| 3247 | + return false; |
| 3248 | + // Check for any register reads. |
| 3249 | + return llvm::any_of(SGPRsUsed, [this, hazardPair, &I](Register Reg) { |
| 3250 | + return I.readsRegister(hazardPair(Reg), &TRI); |
| 3251 | + }); |
| 3252 | + }; |
| 3253 | + auto SearchExpiredFn = [&](const MachineInstr &I, int Count) { |
| 3254 | + return false; |
| 3255 | + }; |
| 3256 | + if (::getWaitStatesSince(SearchHazardFn, MI, SearchExpiredFn) == |
| 3257 | + std::numeric_limits<int>::max()) |
| 3258 | + return false; |
| 3259 | + } |
| 3260 | + |
| 3261 | + // Add s_wait_alu sa_sdst(0) before SALU read. |
| 3262 | + auto NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), |
| 3263 | + TII.get(AMDGPU::S_WAITCNT_DEPCTR)) |
| 3264 | + .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); |
| 3265 | + |
| 3266 | + // SALU read may be after s_getpc in a bundle. |
| 3267 | + updateGetPCBundle(NewMI); |
| 3268 | + |
| 3269 | + return true; |
| 3270 | +} |
| 3271 | + |
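Net effect on the three-instruction pattern described at the top of the function: a single wait inserted at the consumer. A sketch of the mitigated sequence, where the s_wait_alu sa_sdst(0) spelling is taken from the comments in the patch and the surrounding instructions are illustrative only:

  v_mov_b32 v0, s0        ; (1) VALU reads s0
  s_mov_b32 s0, 0         ; (2) SALU writes s0
  s_wait_alu sa_sdst(0)   ; inserted before (3) by this fix
  s_add_i32 s1, s0, s2    ; (3) SALU reads s0

At -O0 the fast path earlier in the function instead places the wait immediately after every hazardous SALU write, trading redundant waits for compile time.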
3013 | 3272 | static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
3014 | 3273 | const SIInstrInfo &TII) {
3015 | 3274 | MachineBasicBlock &EntryMBB = MF->front();
0 commit comments