Skip to content

Commit ebdcef2

Browse files
committed
[AMDGPU] Avoid inserting noops during scheduling
Passes that are run after the post-RA scheduler may insert instructions like waitcnt which eliminate the need for certain noops. After this patch the scheduler is still aware of possible latency from hazards, but noops will not be inserted until the dedicated hazard recognizer pass is run. Depends on D89753. Reviewed By: foad. Differential Revision: https://reviews.llvm.org/D89754
1 parent 37d9078 commit ebdcef2

12 files changed

+70
-266
lines changed

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp

Lines changed: 26 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -138,67 +138,71 @@ static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
138138
ScheduleHazardRecognizer::HazardType
139139
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
140140
MachineInstr *MI = SU->getInstr();
141+
// If we are not in "HazardRecognizerMode" and are therefore being run from
142+
// the scheduler, track possible stalls from hazards but don't insert noops.
143+
auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
144+
141145
if (MI->isBundle())
142146
return NoHazard;
143147

144148
if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
145-
return NoopHazard;
149+
return HazardType;
146150

147151
// FIXME: Should flat be considered vmem?
148152
if ((SIInstrInfo::isVMEM(*MI) ||
149153
SIInstrInfo::isFLAT(*MI))
150154
&& checkVMEMHazards(MI) > 0)
151-
return NoopHazard;
155+
return HazardType;
152156

153157
if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
154-
return NoopHazard;
158+
return HazardType;
155159

156160
if (checkFPAtomicToDenormModeHazard(MI) > 0)
157-
return NoopHazard;
161+
return HazardType;
158162

159163
if (ST.hasNoDataDepHazard())
160164
return NoHazard;
161165

162166
if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
163-
return NoopHazard;
167+
return HazardType;
164168

165169
if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
166-
return NoopHazard;
170+
return HazardType;
167171

168172
if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
169-
return NoopHazard;
173+
return HazardType;
170174

171175
if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
172-
return NoopHazard;
176+
return HazardType;
173177

174178
if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
175-
return NoopHazard;
179+
return HazardType;
176180

177181
if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
178-
return NoopHazard;
182+
return HazardType;
179183

180184
if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
181-
return NoopHazard;
185+
return HazardType;
182186

183187
if (ST.hasReadM0MovRelInterpHazard() &&
184188
(TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
185189
checkReadM0Hazards(MI) > 0)
186-
return NoopHazard;
190+
return HazardType;
187191

188192
if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
189193
checkReadM0Hazards(MI) > 0)
190-
return NoopHazard;
194+
return HazardType;
191195

192196
if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
193-
return NoopHazard;
197+
return HazardType;
194198

195199
if ((SIInstrInfo::isVMEM(*MI) ||
196200
SIInstrInfo::isFLAT(*MI) ||
197201
SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
198-
return NoopHazard;
202+
return HazardType;
199203

200204
if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
201-
return NoopHazard;
205+
return HazardType;
202206

203207
return NoHazard;
204208
}
@@ -312,15 +316,19 @@ void GCNHazardRecognizer::EmitNoop() {
312316
void GCNHazardRecognizer::AdvanceCycle() {
313317
// When the scheduler detects a stall, it will call AdvanceCycle() without
314318
// emitting any instructions.
315-
if (!CurrCycleInstr)
319+
if (!CurrCycleInstr) {
320+
EmittedInstrs.push_front(nullptr);
316321
return;
322+
}
317323

318324
// Do not track non-instructions which do not affect the wait states.
319325
// If included, these instructions can lead to buffer overflow such that
320326
// detectable hazards are missed.
321327
if (CurrCycleInstr->isImplicitDef() || CurrCycleInstr->isDebugInstr() ||
322-
CurrCycleInstr->isKill())
328+
CurrCycleInstr->isKill()) {
329+
CurrCycleInstr = nullptr;
323330
return;
331+
}
324332

325333
if (CurrCycleInstr->isBundle()) {
326334
processBundle();

llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp

Lines changed: 0 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -602,35 +602,6 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
602602
}
603603
}
604604

605-
// Combine adjacent s_nops to use the immediate operand encoding how long
606-
// to wait.
607-
//
608-
// s_nop N
609-
// s_nop M
610-
// =>
611-
// s_nop (N + M)
612-
if (MI.getOpcode() == AMDGPU::S_NOP &&
613-
MI.getNumOperands() == 1 && // Don't merge with implicit operands
614-
Next != MBB.end() &&
615-
(*Next).getOpcode() == AMDGPU::S_NOP &&
616-
(*Next).getNumOperands() == 1) {
617-
618-
MachineInstr &NextMI = *Next;
619-
// The instruction encodes the amount to wait with an offset of 1,
620-
// i.e. 0 is wait 1 cycle. Convert both to cycles and then convert back
621-
// after adding.
622-
uint8_t Nop0 = MI.getOperand(0).getImm() + 1;
623-
uint8_t Nop1 = NextMI.getOperand(0).getImm() + 1;
624-
625-
// Make sure we don't overflow the bounds.
626-
if (Nop0 + Nop1 <= 8) {
627-
NextMI.getOperand(0).setImm(Nop0 + Nop1 - 1);
628-
MI.eraseFromParent();
629-
}
630-
631-
continue;
632-
}
633-
634605
// FIXME: We also need to consider movs of constant operands since
635606
// immediate operands are not folded if they have more than one use, and
636607
// the operand folding pass is unaware if the immediate will be free since

llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_sgpr_idx(<4 x i128> addrspace(
88
; GFX9: ; %bb.0:
99
; GFX9-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x0
1010
; GFX9-NEXT: s_lshl_b32 m0, s4, 1
11-
; GFX9-NEXT: s_nop 0
1211
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1312
; GFX9-NEXT: s_movrels_b64 s[0:1], s[8:9]
1413
; GFX9-NEXT: s_movrels_b64 s[2:3], s[10:11]

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -887,8 +887,8 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace
887887
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
888888
; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, s2
889889
; GFX8-NEXT: s_and_b64 vcc, vcc, s[2:3]
890-
; GFX8-NEXT: s_nop 1
891890
; GFX8-NEXT: s_waitcnt vmcnt(0)
891+
; GFX8-NEXT: s_nop 0
892892
; GFX8-NEXT: v_div_fmas_f32 v2, v1, v2, v3
893893
; GFX8-NEXT: v_mov_b32_e32 v0, s0
894894
; GFX8-NEXT: v_mov_b32_e32 v1, s1
@@ -992,8 +992,8 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out
992992
; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
993993
; GFX7-NEXT: s_mov_b32 s10, -1
994994
; GFX7-NEXT: s_mov_b64 s[6:7], s[10:11]
995-
; GFX7-NEXT: s_nop 1
996995
; GFX7-NEXT: s_waitcnt vmcnt(0)
996+
; GFX7-NEXT: s_nop 0
997997
; GFX7-NEXT: v_div_fmas_f32 v0, v1, v2, v3
998998
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8
999999
; GFX7-NEXT: s_endpgm
@@ -1026,8 +1026,8 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out
10261026
; GFX8-NEXT: s_addc_u32 s1, s5, 0
10271027
; GFX8-NEXT: s_and_b32 s2, 1, s2
10281028
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
1029-
; GFX8-NEXT: s_nop 3
10301029
; GFX8-NEXT: s_waitcnt vmcnt(0)
1030+
; GFX8-NEXT: s_nop 2
10311031
; GFX8-NEXT: v_div_fmas_f32 v2, v1, v2, v3
10321032
; GFX8-NEXT: v_mov_b32_e32 v0, s0
10331033
; GFX8-NEXT: v_mov_b32_e32 v1, s1

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,10 +48,10 @@ define amdgpu_kernel void @update_dpp64_test(i64 addrspace(1)* %arg, i64 %in1, i
4848
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
4949
; GFX8-NEXT: v_mov_b32_e32 v5, s3
5050
; GFX8-NEXT: v_mov_b32_e32 v4, s2
51-
; GFX8-NEXT: s_nop 0
5251
; GFX8-NEXT: s_waitcnt vmcnt(0)
53-
; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
52+
; GFX8-NEXT: s_nop 0
5453
; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
54+
; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
5555
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
5656
; GFX8-NEXT: s_endpgm
5757
;

0 commit comments

Comments
 (0)