@@ -28,26 +28,14 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
28
28
; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat:
29
29
; GFX940: ; %bb.0:
30
30
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
31
- ; GFX940-NEXT: s_mov_b64 s[2:3], 0
31
+ ; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
32
32
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
33
33
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
34
- ; GFX940-NEXT: flat_load_dword v1, v[0:1]
35
- ; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start
36
- ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
37
- ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
38
- ; GFX940-NEXT: v_add_f32_e32 v0, 4.0, v1
39
- ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
40
34
; GFX940-NEXT: buffer_wbl2 sc0 sc1
41
35
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
42
- ; GFX940-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1
36
+ ; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1
43
37
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
44
38
; GFX940-NEXT: buffer_inv sc0 sc1
45
- ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
46
- ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
47
- ; GFX940-NEXT: v_mov_b32_e32 v1, v0
48
- ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
49
- ; GFX940-NEXT: s_cbranch_execnz .LBB1_1
50
- ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
51
39
; GFX940-NEXT: s_endpgm
52
40
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
53
41
ret void
@@ -57,26 +45,14 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
57
45
; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee:
58
46
; GFX940: ; %bb.0:
59
47
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
60
- ; GFX940-NEXT: s_mov_b64 s[2:3], 0
48
+ ; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
61
49
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
62
50
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
63
- ; GFX940-NEXT: flat_load_dword v1, v[0:1]
64
- ; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start
65
- ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
66
- ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
67
- ; GFX940-NEXT: v_add_f32_e32 v0, 4.0, v1
68
- ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
69
51
; GFX940-NEXT: buffer_wbl2 sc0 sc1
70
52
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
71
- ; GFX940-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1
53
+ ; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1
72
54
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
73
55
; GFX940-NEXT: buffer_inv sc0 sc1
74
- ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
75
- ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
76
- ; GFX940-NEXT: v_mov_b32_e32 v1, v0
77
- ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
78
- ; GFX940-NEXT: s_cbranch_execnz .LBB2_1
79
- ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
80
56
; GFX940-NEXT: s_endpgm
81
57
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
82
58
ret void
@@ -97,25 +73,12 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) {
97
73
; GFX940-LABEL: flat_atomic_fadd_f32_rtn_pat:
98
74
; GFX940: ; %bb.0:
99
75
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
100
- ; GFX940-NEXT: flat_load_dword v2, v[0:1]
101
- ; GFX940-NEXT: s_mov_b64 s[0:1], 0
102
- ; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start
103
- ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
104
- ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
105
- ; GFX940-NEXT: v_mov_b32_e32 v3, v2
106
- ; GFX940-NEXT: v_add_f32_e32 v2, 4.0, v3
76
+ ; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
107
77
; GFX940-NEXT: buffer_wbl2 sc0 sc1
108
78
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
109
- ; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
79
+ ; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 sc1
110
80
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
111
81
; GFX940-NEXT: buffer_inv sc0 sc1
112
- ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
113
- ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
114
- ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
115
- ; GFX940-NEXT: s_cbranch_execnz .LBB4_1
116
- ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
117
- ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
118
- ; GFX940-NEXT: v_mov_b32_e32 v0, v2
119
82
; GFX940-NEXT: s_setpc_b64 s[30:31]
120
83
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
121
84
ret float %ret
0 commit comments