Skip to content

Commit e501ed8

Browse files
committed
[AMDGPU] Don't flush vmcnt for loops with use/def pairs
Conditions for hoisting vmcnt with flat instructions should be similar to VMEM. If there are use/def pairs in a loop body we cannot guarantee that hoisting the waitcnt will be profitable. Better heuristics are needed to analyse whether gains from avoiding waitcnt in loop bodies outweigh waiting for loads in the preheader. Reviewed By: foad Differential Revision: https://reviews.llvm.org/D151126
1 parent fb7f50a commit e501ed8

10 files changed

+329
-476
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,7 @@ class SIInsertWaitcnts : public MachineFunctionPass {
398398
bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets);
399399
bool isPreheaderToFlush(MachineBasicBlock &MBB,
400400
WaitcntBrackets &ScoreBrackets);
401+
bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
401402
bool runOnMachineFunction(MachineFunction &MF) override;
402403

403404
StringRef getPassName() const override {
@@ -1703,6 +1704,11 @@ bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
17031704
return UpdateCache(false);
17041705
}
17051706

1707+
bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
1708+
return SIInstrInfo::isVMEM(MI) ||
1709+
(SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI));
1710+
}
1711+
17061712
// Return true if it is better to flush the vmcnt counter in the preheader of
17071713
// the given loop. We currently decide to flush in two situations:
17081714
// 1. The loop contains vmem store(s), no vmem load and at least one use of a
@@ -1721,8 +1727,7 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
17211727

17221728
for (MachineBasicBlock *MBB : ML->blocks()) {
17231729
for (MachineInstr &MI : *MBB) {
1724-
if (SIInstrInfo::isVMEM(MI) ||
1725-
(SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI))) {
1730+
if (isVMEMOrFlatVMEM(MI)) {
17261731
if (MI.mayLoad())
17271732
HasVMemLoad = true;
17281733
if (MI.mayStore())
@@ -1750,7 +1755,7 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
17501755
}
17511756
}
17521757
// VMem load vgpr def
1753-
else if (SIInstrInfo::isVMEM(MI) && MI.mayLoad() && Op.isDef())
1758+
else if (isVMEMOrFlatVMEM(MI) && MI.mayLoad() && Op.isDef())
17541759
for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
17551760
// If we find a register that is loaded inside the loop, 1. and 2.
17561761
// are invalidated and we can exit.

llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -755,9 +755,9 @@ define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %dat
755755
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
756756
; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
757757
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
758-
; GFX90A-NEXT: s_waitcnt vmcnt(0)
759758
; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start
760759
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
760+
; GFX90A-NEXT: s_waitcnt vmcnt(0)
761761
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
762762
; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0
763763
; GFX90A-NEXT: buffer_wbl2
@@ -824,9 +824,9 @@ define double @global_atomic_fadd_f64_rtn_pat_system(ptr addrspace(1) %ptr, doub
824824
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
825825
; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
826826
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
827-
; GFX90A-NEXT: s_waitcnt vmcnt(0)
828827
; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start
829828
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
829+
; GFX90A-NEXT: s_waitcnt vmcnt(0)
830830
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
831831
; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0
832832
; GFX90A-NEXT: buffer_wbl2
@@ -947,10 +947,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
947947
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
948948
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
949949
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
950-
; GFX90A-NEXT: s_waitcnt vmcnt(0)
951950
; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start
952951
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
953-
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
952+
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
954953
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
955954
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1]
956955
; GFX90A-NEXT: buffer_wbl2
@@ -1023,10 +1022,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
10231022
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
10241023
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
10251024
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
1026-
; GFX90A-NEXT: s_waitcnt vmcnt(0)
10271025
; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start
10281026
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
1029-
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1027+
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10301028
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
10311029
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1]
10321030
; GFX90A-NEXT: buffer_wbl2
@@ -1067,10 +1065,9 @@ define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 {
10671065
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10681066
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
10691067
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
1070-
; GFX90A-NEXT: s_waitcnt vmcnt(0)
10711068
; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start
10721069
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
1073-
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1070+
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10741071
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
10751072
; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0
10761073
; GFX90A-NEXT: buffer_wbl2
@@ -1137,10 +1134,9 @@ define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 {
11371134
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11381135
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
11391136
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
1140-
; GFX90A-NEXT: s_waitcnt vmcnt(0)
11411137
; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start
11421138
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
1143-
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1139+
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11441140
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
11451141
; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0
11461142
; GFX90A-NEXT: buffer_wbl2
@@ -1228,10 +1224,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
12281224
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
12291225
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
12301226
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
1231-
; GFX90A-NEXT: s_waitcnt vmcnt(0)
12321227
; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start
12331228
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
1234-
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1229+
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12351230
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
12361231
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1]
12371232
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)

llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll

Lines changed: 7 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,9 @@ define float @syncscope_system(ptr %addr, float %val) #0 {
1010
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1111
; GFX908-NEXT: flat_load_dword v3, v[0:1]
1212
; GFX908-NEXT: s_mov_b64 s[4:5], 0
13-
; GFX908-NEXT: s_waitcnt vmcnt(0)
1413
; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start
1514
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
16-
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
15+
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1716
; GFX908-NEXT: v_mov_b32_e32 v4, v3
1817
; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
1918
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -34,10 +33,9 @@ define float @syncscope_system(ptr %addr, float %val) #0 {
3433
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3534
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
3635
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
37-
; GFX90A-NEXT: s_waitcnt vmcnt(0)
3836
; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start
3937
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
40-
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
38+
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4139
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
4240
; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
4341
; GFX90A-NEXT: buffer_wbl2
@@ -71,10 +69,9 @@ define float @syncscope_system(ptr %addr, float %val) #0 {
7169
; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
7270
; GFX1100-NEXT: flat_load_b32 v3, v[0:1]
7371
; GFX1100-NEXT: s_mov_b32 s0, 0
74-
; GFX1100-NEXT: s_waitcnt vmcnt(0)
7572
; GFX1100-NEXT: .LBB0_1: ; %atomicrmw.start
7673
; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1
77-
; GFX1100-NEXT: s_waitcnt lgkmcnt(0)
74+
; GFX1100-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7875
; GFX1100-NEXT: v_mov_b32_e32 v4, v3
7976
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
8077
; GFX1100-NEXT: v_add_f32_e32 v3, v4, v2
@@ -103,10 +100,9 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 {
103100
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
104101
; GFX908-NEXT: flat_load_dword v3, v[0:1]
105102
; GFX908-NEXT: s_mov_b64 s[4:5], 0
106-
; GFX908-NEXT: s_waitcnt vmcnt(0)
107103
; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start
108104
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
109-
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
105+
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
110106
; GFX908-NEXT: v_mov_b32_e32 v4, v3
111107
; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
112108
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
@@ -321,10 +317,9 @@ define float @no_unsafe(ptr %addr, float %val) {
321317
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
322318
; GFX908-NEXT: flat_load_dword v3, v[0:1]
323319
; GFX908-NEXT: s_mov_b64 s[4:5], 0
324-
; GFX908-NEXT: s_waitcnt vmcnt(0)
325320
; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start
326321
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
327-
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
322+
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
328323
; GFX908-NEXT: v_mov_b32_e32 v4, v3
329324
; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
330325
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
@@ -344,10 +339,9 @@ define float @no_unsafe(ptr %addr, float %val) {
344339
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
345340
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
346341
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
347-
; GFX90A-NEXT: s_waitcnt vmcnt(0)
348342
; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start
349343
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
350-
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
344+
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
351345
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
352346
; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
353347
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
@@ -375,10 +369,9 @@ define float @no_unsafe(ptr %addr, float %val) {
375369
; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
376370
; GFX1100-NEXT: flat_load_b32 v3, v[0:1]
377371
; GFX1100-NEXT: s_mov_b32 s0, 0
378-
; GFX1100-NEXT: s_waitcnt vmcnt(0)
379372
; GFX1100-NEXT: .LBB3_1: ; %atomicrmw.start
380373
; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1
381-
; GFX1100-NEXT: s_waitcnt lgkmcnt(0)
374+
; GFX1100-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
382375
; GFX1100-NEXT: v_mov_b32_e32 v4, v3
383376
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
384377
; GFX1100-NEXT: v_add_f32_e32 v3, v4, v2

llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,9 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind {
3434
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3535
; GCN-NEXT: global_load_dword v2, v[0:1], off
3636
; GCN-NEXT: s_mov_b64 s[4:5], 0
37-
; GCN-NEXT: s_waitcnt vmcnt(0)
3837
; GCN-NEXT: .LBB1_1: ; %atomicrmw.start
3938
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
39+
; GCN-NEXT: s_waitcnt vmcnt(0)
4040
; GCN-NEXT: v_mov_b32_e32 v3, v2
4141
; GCN-NEXT: v_not_b32_e32 v2, v3
4242
; GCN-NEXT: v_or_b32_e32 v2, -5, v2
@@ -62,10 +62,9 @@ define i32 @atomic_nand_i32_flat(ptr %ptr) nounwind {
6262
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6363
; GCN-NEXT: flat_load_dword v2, v[0:1]
6464
; GCN-NEXT: s_mov_b64 s[4:5], 0
65-
; GCN-NEXT: s_waitcnt vmcnt(0)
6665
; GCN-NEXT: .LBB2_1: ; %atomicrmw.start
6766
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
68-
; GCN-NEXT: s_waitcnt lgkmcnt(0)
67+
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6968
; GCN-NEXT: v_mov_b32_e32 v3, v2
7069
; GCN-NEXT: v_not_b32_e32 v2, v3
7170
; GCN-NEXT: v_or_b32_e32 v2, -5, v2

0 commit comments

Comments
 (0)