1
1
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2
2
; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
3
- ; REQUIRES: do-not-run-me
4
3
5
4
; Divergent phis that don't require lowering using lane mask merging
6
5
@@ -66,15 +65,16 @@ exit:
66
65
define amdgpu_ps void @divergent_i1_phi_uniform_branch_simple(ptr addrspace(1) %out, i32 %tid, i32 inreg %cond) {
67
66
; GFX10-LABEL: divergent_i1_phi_uniform_branch_simple:
68
67
; GFX10: ; %bb.0: ; %A
68
+ ; GFX10-NEXT: v_cmp_le_u32_e64 s1, 6, v2
69
69
; GFX10-NEXT: s_cmp_lg_u32 s0, 0
70
- ; GFX10-NEXT: s_cbranch_scc0 .LBB1_2
71
- ; GFX10-NEXT: ; %bb.1:
72
- ; GFX10-NEXT: v_cmp_le_u32_e64 s0, 6, v2
73
- ; GFX10-NEXT: s_branch .LBB1_3
74
- ; GFX10-NEXT: .LBB1_2: ; %B
75
- ; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 1, v2
76
- ; GFX10-NEXT: .LBB1_3: ; %exit
77
- ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, s0
70
+ ; GFX10-NEXT: s_cbranch_scc1 .LBB1_2
71
+ ; GFX10-NEXT: ; %bb.1: ; %B
72
+ ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 1, v2
73
+ ; GFX10-NEXT: s_andn2_b32 s0, s1, exec_lo
74
+ ; GFX10-NEXT: s_and_b32 s1, exec_lo, vcc_lo
75
+ ; GFX10-NEXT: s_or_b32 s1, s0, s1
76
+ ; GFX10-NEXT: .LBB1_2: ; %exit
77
+ ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, s1
78
78
; GFX10-NEXT: v_add_nc_u32_e32 v2, 2, v2
79
79
; GFX10-NEXT: global_store_dword v[0:1], v2, off
80
80
; GFX10-NEXT: s_endpgm
@@ -101,23 +101,27 @@ define void @divergent_i1_phi_used_inside_loop(float %val, ptr %addr) {
101
101
; GFX10-LABEL: divergent_i1_phi_used_inside_loop:
102
102
; GFX10: ; %bb.0: ; %entry
103
103
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
104
- ; GFX10-NEXT: s_mov_b32 s4, 0
104
+ ; GFX10-NEXT: s_mov_b32 s5, 0
105
105
; GFX10-NEXT: v_mov_b32_e32 v3, 1
106
- ; GFX10-NEXT: v_mov_b32_e32 v4, s4
106
+ ; GFX10-NEXT: v_mov_b32_e32 v4, s5
107
+ ; GFX10-NEXT: ; implicit-def: $sgpr6
107
108
; GFX10-NEXT: .LBB2_1: ; %loop
108
109
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
109
- ; GFX10-NEXT: v_cvt_f32_u32_e32 v5, v4
110
110
; GFX10-NEXT: v_xor_b32_e32 v3, 1, v3
111
+ ; GFX10-NEXT: v_cvt_f32_u32_e32 v5, v4
111
112
; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v4
113
+ ; GFX10-NEXT: v_and_b32_e32 v6, 1, v3
112
114
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v0
113
- ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
114
- ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
115
+ ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v6
116
+ ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
117
+ ; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo
118
+ ; GFX10-NEXT: s_and_b32 s4, exec_lo, s4
119
+ ; GFX10-NEXT: s_or_b32 s6, s6, s4
120
+ ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
115
121
; GFX10-NEXT: s_cbranch_execnz .LBB2_1
116
122
; GFX10-NEXT: ; %bb.2: ; %exit
117
- ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
118
- ; GFX10-NEXT: v_and_b32_e32 v0, 1, v3
119
- ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
120
- ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
123
+ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
124
+ ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6
121
125
; GFX10-NEXT: flat_store_dword v[1:2], v0
122
126
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
123
127
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -144,44 +148,49 @@ define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, floa
144
148
; GFX10: ; %bb.0: ; %entry
145
149
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
146
150
; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, 1.0, v1
147
- ; GFX10-NEXT: s_mov_b32 s4, 0
148
- ; GFX10-NEXT: v_mov_b32_e32 v8, 0x3e8
149
- ; GFX10-NEXT: v_mov_b32_e32 v9, s4
150
- ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
151
+ ; GFX10-NEXT: s_mov_b32 s5, 0
152
+ ; GFX10-NEXT: v_mov_b32_e32 v1, 0x3e8
153
+ ; GFX10-NEXT: v_mov_b32_e32 v8, s5
154
+ ; GFX10-NEXT: ; implicit-def: $sgpr6
155
+ ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo
151
156
; GFX10-NEXT: s_branch .LBB3_2
152
157
; GFX10-NEXT: .LBB3_1: ; %loop_body
153
158
; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1
154
- ; GFX10-NEXT: v_cvt_f32_u32_e32 v10, v9
155
- ; GFX10-NEXT: v_xor_b32_e32 v1, 1, v1
156
- ; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v9
157
- ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v10, v0
158
- ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
159
- ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
159
+ ; GFX10-NEXT: v_cvt_f32_u32_e32 v9, v8
160
+ ; GFX10-NEXT: s_xor_b32 s4, s4, -1
161
+ ; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v8
162
+ ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v0
163
+ ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s4
164
+ ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
165
+ ; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo
166
+ ; GFX10-NEXT: s_and_b32 s4, exec_lo, s4
167
+ ; GFX10-NEXT: s_or_b32 s6, s6, s4
168
+ ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
160
169
; GFX10-NEXT: s_cbranch_execz .LBB3_6
161
170
; GFX10-NEXT: .LBB3_2: ; %loop_start
162
171
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
163
- ; GFX10-NEXT: v_cmp_ge_i32_e32 vcc_lo, 0x3e8, v9
164
- ; GFX10-NEXT: s_mov_b32 s5, 1
172
+ ; GFX10-NEXT: v_and_b32_e32 v9, 1, v9
173
+ ; GFX10-NEXT: v_cmp_ge_i32_e32 vcc_lo, 0x3e8, v8
174
+ ; GFX10-NEXT: s_mov_b32 s7, 1
175
+ ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v9
165
176
; GFX10-NEXT: s_cbranch_vccz .LBB3_4
166
177
; GFX10-NEXT: ; %bb.3: ; %else
167
178
; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1
168
- ; GFX10-NEXT: s_mov_b32 s5, 0
169
- ; GFX10-NEXT: flat_store_dword v[6:7], v8
179
+ ; GFX10-NEXT: s_mov_b32 s7, 0
180
+ ; GFX10-NEXT: flat_store_dword v[6:7], v1
170
181
; GFX10-NEXT: .LBB3_4: ; %Flow
171
182
; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1
172
- ; GFX10-NEXT: s_xor_b32 s5, s5, 1
173
- ; GFX10-NEXT: s_and_b32 s5, s5, 1
174
- ; GFX10-NEXT: s_cmp_lg_u32 s5, 0
183
+ ; GFX10-NEXT: s_xor_b32 s7, s7, 1
184
+ ; GFX10-NEXT: s_and_b32 s7, s7, 1
185
+ ; GFX10-NEXT: s_cmp_lg_u32 s7, 0
175
186
; GFX10-NEXT: s_cbranch_scc1 .LBB3_1
176
187
; GFX10-NEXT: ; %bb.5: ; %if
177
188
; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1
178
- ; GFX10-NEXT: flat_store_dword v[4:5], v8
189
+ ; GFX10-NEXT: flat_store_dword v[4:5], v1
179
190
; GFX10-NEXT: s_branch .LBB3_1
180
191
; GFX10-NEXT: .LBB3_6: ; %exit
181
- ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
182
- ; GFX10-NEXT: v_and_b32_e32 v0, 1, v1
183
- ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
184
- ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
192
+ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
193
+ ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6
185
194
; GFX10-NEXT: flat_store_dword v[2:3], v0
186
195
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
187
196
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -221,16 +230,15 @@ exit:
221
230
define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3 x i32> inreg %.WorkgroupId, <3 x i32> %.LocalInvocationId) #0 {
222
231
; GFX10-LABEL: single_lane_execution_attribute:
223
232
; GFX10: ; %bb.0: ; %.entry
224
- ; GFX10-NEXT: s_mov_b32 s12, 0
225
233
; GFX10-NEXT: s_getpc_b64 s[4:5]
234
+ ; GFX10-NEXT: s_mov_b32 s12, 0
226
235
; GFX10-NEXT: s_mov_b32 s13, -1
227
236
; GFX10-NEXT: s_mov_b32 s2, s0
228
237
; GFX10-NEXT: s_and_b64 s[4:5], s[4:5], s[12:13]
229
238
; GFX10-NEXT: s_mov_b32 s3, s12
230
239
; GFX10-NEXT: v_mbcnt_lo_u32_b32 v1, -1, 0
231
240
; GFX10-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3]
232
241
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
233
- ; GFX10-NEXT: s_mov_b32 s2, 1
234
242
; GFX10-NEXT: v_mbcnt_hi_u32_b32 v1, -1, v1
235
243
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v1
236
244
; GFX10-NEXT: v_and_b32_e32 v3, 1, v1
@@ -257,13 +265,12 @@ define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3
257
265
; GFX10-NEXT: s_cbranch_vccnz .LBB4_2
258
266
; GFX10-NEXT: ; %bb.3: ; %.preheader._crit_edge
259
267
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
268
+ ; GFX10-NEXT: s_mov_b32 s13, 0
260
269
; GFX10-NEXT: s_or_b32 s2, s0, vcc_lo
261
270
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
262
- ; GFX10-NEXT: s_mov_b32 s2, 0
263
271
; GFX10-NEXT: .LBB4_4: ; %Flow
264
- ; GFX10-NEXT: s_and_b32 s2, s2, 1
265
- ; GFX10-NEXT: s_cmp_lg_u32 s2, 0
266
- ; GFX10-NEXT: s_cbranch_scc0 .LBB4_6
272
+ ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s13
273
+ ; GFX10-NEXT: s_cbranch_vccz .LBB4_6
267
274
; GFX10-NEXT: ; %bb.5: ; %.19
268
275
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
269
276
; GFX10-NEXT: v_or_b32_e32 v3, 2, v1
0 commit comments