Skip to content

Commit 94def1b

Browse files
committed
[AMDGPU] Do not expand fp atomics on gfx940
FP atomics are safe on gfx940. This fixes a regression after D131560. Fixes: SWDEV-380468. Differential Revision: https://reviews.llvm.org/D143603
1 parent e887627 commit 94def1b

File tree

7 files changed

+59
-411
lines changed

7 files changed

+59
-411
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12942,6 +12942,9 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1294212942

1294312943
if (AMDGPU::isFlatGlobalAddrSpace(AS) &&
1294412944
Subtarget->hasAtomicFaddNoRtnInsts()) {
12945+
if (Subtarget->hasGFX940Insts())
12946+
return AtomicExpansionKind::None;
12947+
1294512948
if (unsafeFPAtomicsDisabled(RMW->getFunction()))
1294612949
return AtomicExpansionKind::CmpXChg;
1294712950

llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll

Lines changed: 6 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -28,26 +28,14 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
2828
; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat:
2929
; GFX940: ; %bb.0:
3030
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
31-
; GFX940-NEXT: s_mov_b64 s[2:3], 0
31+
; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
3232
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
3333
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
34-
; GFX940-NEXT: flat_load_dword v1, v[0:1]
35-
; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start
36-
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
37-
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
38-
; GFX940-NEXT: v_add_f32_e32 v0, 4.0, v1
39-
; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
4034
; GFX940-NEXT: buffer_wbl2 sc0 sc1
4135
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
42-
; GFX940-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1
36+
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1
4337
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4438
; GFX940-NEXT: buffer_inv sc0 sc1
45-
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
46-
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
47-
; GFX940-NEXT: v_mov_b32_e32 v1, v0
48-
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
49-
; GFX940-NEXT: s_cbranch_execnz .LBB1_1
50-
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
5139
; GFX940-NEXT: s_endpgm
5240
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
5341
ret void
@@ -57,26 +45,14 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
5745
; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee:
5846
; GFX940: ; %bb.0:
5947
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
60-
; GFX940-NEXT: s_mov_b64 s[2:3], 0
48+
; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
6149
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
6250
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
63-
; GFX940-NEXT: flat_load_dword v1, v[0:1]
64-
; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start
65-
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
66-
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
67-
; GFX940-NEXT: v_add_f32_e32 v0, 4.0, v1
68-
; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
6951
; GFX940-NEXT: buffer_wbl2 sc0 sc1
7052
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
71-
; GFX940-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1
53+
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1
7254
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7355
; GFX940-NEXT: buffer_inv sc0 sc1
74-
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
75-
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
76-
; GFX940-NEXT: v_mov_b32_e32 v1, v0
77-
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
78-
; GFX940-NEXT: s_cbranch_execnz .LBB2_1
79-
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
8056
; GFX940-NEXT: s_endpgm
8157
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
8258
ret void
@@ -97,25 +73,12 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) {
9773
; GFX940-LABEL: flat_atomic_fadd_f32_rtn_pat:
9874
; GFX940: ; %bb.0:
9975
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
100-
; GFX940-NEXT: flat_load_dword v2, v[0:1]
101-
; GFX940-NEXT: s_mov_b64 s[0:1], 0
102-
; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start
103-
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
104-
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
105-
; GFX940-NEXT: v_mov_b32_e32 v3, v2
106-
; GFX940-NEXT: v_add_f32_e32 v2, 4.0, v3
76+
; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
10777
; GFX940-NEXT: buffer_wbl2 sc0 sc1
10878
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
109-
; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
79+
; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 sc1
11080
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11181
; GFX940-NEXT: buffer_inv sc0 sc1
112-
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
113-
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
114-
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
115-
; GFX940-NEXT: s_cbranch_execnz .LBB4_1
116-
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
117-
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
118-
; GFX940-NEXT: v_mov_b32_e32 v0, v2
11982
; GFX940-NEXT: s_setpc_b64 s[30:31]
12083
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
12184
ret float %ret

llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll

Lines changed: 2 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -56,25 +56,11 @@ define float @syncscope_system(ptr %addr, float %val) #0 {
5656
; GFX940-LABEL: syncscope_system:
5757
; GFX940: ; %bb.0:
5858
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
59-
; GFX940-NEXT: flat_load_dword v3, v[0:1]
60-
; GFX940-NEXT: s_mov_b64 s[0:1], 0
61-
; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start
62-
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
63-
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
64-
; GFX940-NEXT: v_mov_b32_e32 v5, v3
65-
; GFX940-NEXT: v_add_f32_e32 v4, v5, v2
6659
; GFX940-NEXT: buffer_wbl2 sc0 sc1
6760
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
68-
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 sc1
61+
; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 sc1
6962
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7063
; GFX940-NEXT: buffer_inv sc0 sc1
71-
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
72-
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
73-
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
74-
; GFX940-NEXT: s_cbranch_execnz .LBB0_1
75-
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
76-
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
77-
; GFX940-NEXT: v_mov_b32_e32 v0, v3
7864
; GFX940-NEXT: s_setpc_b64 s[30:31]
7965
;
8066
; GFX1100-LABEL: syncscope_system:
@@ -373,23 +359,8 @@ define float @no_unsafe(ptr %addr, float %val) {
373359
; GFX940-LABEL: no_unsafe:
374360
; GFX940: ; %bb.0:
375361
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
376-
; GFX940-NEXT: flat_load_dword v3, v[0:1]
377-
; GFX940-NEXT: s_mov_b64 s[0:1], 0
378-
; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start
379-
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
380-
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
381-
; GFX940-NEXT: v_mov_b32_e32 v5, v3
382-
; GFX940-NEXT: v_add_f32_e32 v4, v5, v2
383-
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
384-
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
362+
; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0
385363
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
386-
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
387-
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
388-
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
389-
; GFX940-NEXT: s_cbranch_execnz .LBB3_1
390-
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
391-
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
392-
; GFX940-NEXT: v_mov_b32_e32 v0, v3
393364
; GFX940-NEXT: s_setpc_b64 s[30:31]
394365
;
395366
; GFX1100-LABEL: no_unsafe:

llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll

Lines changed: 6 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -28,26 +28,14 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
2828
; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat:
2929
; GFX940: ; %bb.0:
3030
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
31-
; GFX940-NEXT: s_mov_b64 s[2:3], 0
31+
; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
3232
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
3333
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
34-
; GFX940-NEXT: flat_load_dword v1, v[0:1]
35-
; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start
36-
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
37-
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
38-
; GFX940-NEXT: v_add_f32_e32 v0, 4.0, v1
39-
; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
4034
; GFX940-NEXT: buffer_wbl2 sc0 sc1
4135
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
42-
; GFX940-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1
36+
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1
4337
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4438
; GFX940-NEXT: buffer_inv sc0 sc1
45-
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
46-
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
47-
; GFX940-NEXT: v_mov_b32_e32 v1, v0
48-
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
49-
; GFX940-NEXT: s_cbranch_execnz .LBB1_1
50-
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
5139
; GFX940-NEXT: s_endpgm
5240
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
5341
ret void
@@ -57,26 +45,14 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
5745
; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee:
5846
; GFX940: ; %bb.0:
5947
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
60-
; GFX940-NEXT: s_mov_b64 s[2:3], 0
48+
; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
6149
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
6250
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
63-
; GFX940-NEXT: flat_load_dword v1, v[0:1]
64-
; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start
65-
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
66-
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
67-
; GFX940-NEXT: v_add_f32_e32 v0, 4.0, v1
68-
; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
6951
; GFX940-NEXT: buffer_wbl2 sc0 sc1
7052
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
71-
; GFX940-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1
53+
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1
7254
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7355
; GFX940-NEXT: buffer_inv sc0 sc1
74-
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
75-
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
76-
; GFX940-NEXT: v_mov_b32_e32 v1, v0
77-
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
78-
; GFX940-NEXT: s_cbranch_execnz .LBB2_1
79-
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
8056
; GFX940-NEXT: s_endpgm
8157
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
8258
ret void
@@ -97,25 +73,12 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) {
9773
; GFX940-LABEL: flat_atomic_fadd_f32_rtn_pat:
9874
; GFX940: ; %bb.0:
9975
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
100-
; GFX940-NEXT: flat_load_dword v2, v[0:1]
101-
; GFX940-NEXT: s_mov_b64 s[0:1], 0
102-
; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start
103-
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
104-
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
105-
; GFX940-NEXT: v_mov_b32_e32 v3, v2
106-
; GFX940-NEXT: v_add_f32_e32 v2, 4.0, v3
76+
; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
10777
; GFX940-NEXT: buffer_wbl2 sc0 sc1
10878
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
109-
; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
79+
; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 sc1
11080
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11181
; GFX940-NEXT: buffer_inv sc0 sc1
112-
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
113-
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
114-
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
115-
; GFX940-NEXT: s_cbranch_execnz .LBB4_1
116-
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
117-
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
118-
; GFX940-NEXT: v_mov_b32_e32 v0, v2
11982
; GFX940-NEXT: s_setpc_b64 s[30:31]
12083
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
12184
ret float %ret

0 commit comments

Comments
 (0)