@@ -113,7 +113,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
113
113
; CHECK-NEXT: v_mov_b32_e32 v42, v0
114
114
; CHECK-NEXT: s_mov_b32 s48, exec_lo
115
115
; CHECK-NEXT: v_cmpx_ne_u32_e32 0, v42
116
- ; CHECK-NEXT: s_cbranch_execz .LBB0_25
116
+ ; CHECK-NEXT: s_cbranch_execz .LBB0_24
117
117
; CHECK-NEXT: ; %bb.1: ; %.preheader5
118
118
; CHECK-NEXT: v_mul_lo_u32 v0, v40, 14
119
119
; CHECK-NEXT: s_mov_b32 s4, 0
@@ -133,58 +133,75 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
133
133
; CHECK-NEXT: s_mov_b32 s49, 0
134
134
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v45
135
135
; CHECK-NEXT: s_and_b32 exec_lo, exec_lo, vcc_lo
136
- ; CHECK-NEXT: s_cbranch_execz .LBB0_25
136
+ ; CHECK-NEXT: s_cbranch_execz .LBB0_24
137
137
; CHECK-NEXT: ; %bb.4:
138
138
; CHECK-NEXT: v_lshlrev_b32_e32 v43, 10, v43
139
139
; CHECK-NEXT: v_add_nc_u32_e32 v46, 0x3c05, v0
140
140
; CHECK-NEXT: v_mov_b32_e32 v47, 0
141
+ ; CHECK-NEXT: s_mov_b32 s55, 0
141
142
; CHECK-NEXT: s_getpc_b64 s[42:43]
142
143
; CHECK-NEXT: s_add_u32 s42, s42, _Z10atomic_incPU3AS3Vj@rel32@lo+4
143
144
; CHECK-NEXT: s_addc_u32 s43, s43, _Z10atomic_incPU3AS3Vj@rel32@hi+12
144
- ; CHECK-NEXT: s_mov_b32 s55, 0
145
- ; CHECK-NEXT: .LBB0_5: ; =>This Loop Header: Depth=1
146
- ; CHECK-NEXT: ; Child Loop BB0_8 Depth 2
147
- ; CHECK-NEXT: ; Child Loop BB0_20 Depth 2
145
+ ; CHECK-NEXT: s_branch .LBB0_7
146
+ ; CHECK-NEXT: .LBB0_5: ; %Flow37
147
+ ; CHECK-NEXT: ; in Loop: Header=BB0_7 Depth=1
148
+ ; CHECK-NEXT: s_inst_prefetch 0x2
149
+ ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s56
150
+ ; CHECK-NEXT: .LBB0_6: ; %Flow38
151
+ ; CHECK-NEXT: ; in Loop: Header=BB0_7 Depth=1
152
+ ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55
153
+ ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s54, v45
154
+ ; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v47
155
+ ; CHECK-NEXT: v_add_nc_u32_e32 v46, 1, v46
156
+ ; CHECK-NEXT: s_mov_b32 s55, s54
157
+ ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
158
+ ; CHECK-NEXT: s_and_b32 s4, exec_lo, s4
159
+ ; CHECK-NEXT: s_or_b32 s49, s4, s49
160
+ ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s49
161
+ ; CHECK-NEXT: s_cbranch_execz .LBB0_24
162
+ ; CHECK-NEXT: .LBB0_7: ; =>This Loop Header: Depth=1
163
+ ; CHECK-NEXT: ; Child Loop BB0_10 Depth 2
164
+ ; CHECK-NEXT: ; Child Loop BB0_22 Depth 2
148
165
; CHECK-NEXT: v_add_nc_u32_e32 v0, s55, v44
149
166
; CHECK-NEXT: s_lshl_b32 s4, s55, 5
150
167
; CHECK-NEXT: s_add_i32 s54, s55, 1
151
168
; CHECK-NEXT: s_add_i32 s5, s55, 5
152
169
; CHECK-NEXT: v_or3_b32 v57, s4, v43, s54
153
170
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
154
171
; CHECK-NEXT: ds_read_u8 v56, v0
155
- ; CHECK-NEXT: v_mov_b32_e32 v59 , s54
172
+ ; CHECK-NEXT: v_mov_b32_e32 v58 , s54
156
173
; CHECK-NEXT: s_mov_b32 s56, exec_lo
157
174
; CHECK-NEXT: v_cmpx_lt_u32_e64 s5, v42
158
- ; CHECK-NEXT: s_cbranch_execz .LBB0_17
159
- ; CHECK-NEXT: ; %bb.6: ; %.preheader2
160
- ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
161
- ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
162
- ; CHECK-NEXT: v_and_b32_e32 v58, 0xff, v56
175
+ ; CHECK-NEXT: s_cbranch_execz .LBB0_19
176
+ ; CHECK-NEXT: ; %bb.8: ; %.preheader2
177
+ ; CHECK-NEXT: ; in Loop: Header=BB0_7 Depth=1
163
178
; CHECK-NEXT: s_mov_b32 s57, 0
164
179
; CHECK-NEXT: s_mov_b32 s58, 0
165
- ; CHECK-NEXT: s_branch .LBB0_8
166
- ; CHECK-NEXT: .LBB0_7 : ; in Loop: Header=BB0_8 Depth=2
180
+ ; CHECK-NEXT: s_branch .LBB0_10
181
+ ; CHECK-NEXT: .LBB0_9 : ; in Loop: Header=BB0_10 Depth=2
167
182
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s59
168
183
; CHECK-NEXT: s_add_i32 s58, s58, 4
169
184
; CHECK-NEXT: s_add_i32 s4, s55, s58
170
185
; CHECK-NEXT: v_add_nc_u32_e32 v0, s58, v57
171
186
; CHECK-NEXT: s_add_i32 s5, s4, 5
172
187
; CHECK-NEXT: s_add_i32 s4, s4, 1
173
188
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s5, v42
174
- ; CHECK-NEXT: v_mov_b32_e32 v59 , s4
189
+ ; CHECK-NEXT: v_mov_b32_e32 v58 , s4
175
190
; CHECK-NEXT: s_or_b32 s57, vcc_lo, s57
176
191
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s57
177
- ; CHECK-NEXT: s_cbranch_execz .LBB0_16
178
- ; CHECK-NEXT: .LBB0_8 : ; Parent Loop BB0_5 Depth=1
192
+ ; CHECK-NEXT: s_cbranch_execz .LBB0_18
193
+ ; CHECK-NEXT: .LBB0_10 : ; Parent Loop BB0_7 Depth=1
179
194
; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
180
- ; CHECK-NEXT: v_add_nc_u32_e32 v60, s58, v46
181
- ; CHECK-NEXT: v_add_nc_u32_e32 v59, s58, v57
195
+ ; CHECK-NEXT: v_add_nc_u32_e32 v59, s58, v46
196
+ ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
197
+ ; CHECK-NEXT: v_and_b32_e32 v60, 0xff, v56
198
+ ; CHECK-NEXT: v_add_nc_u32_e32 v58, s58, v57
182
199
; CHECK-NEXT: s_mov_b32 s59, exec_lo
183
- ; CHECK-NEXT: ds_read_u8 v0, v60
200
+ ; CHECK-NEXT: ds_read_u8 v0, v59
184
201
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
185
- ; CHECK-NEXT: v_cmpx_eq_u16_e64 v58 , v0
186
- ; CHECK-NEXT: s_cbranch_execz .LBB0_10
187
- ; CHECK-NEXT: ; %bb.9 : ; in Loop: Header=BB0_8 Depth=2
202
+ ; CHECK-NEXT: v_cmpx_eq_u16_e64 v60 , v0
203
+ ; CHECK-NEXT: s_cbranch_execz .LBB0_12
204
+ ; CHECK-NEXT: ; %bb.11 : ; in Loop: Header=BB0_10 Depth=2
188
205
; CHECK-NEXT: v_mov_b32_e32 v31, v41
189
206
; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
190
207
; CHECK-NEXT: s_add_u32 s8, s34, 40
@@ -197,15 +214,15 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
197
214
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
198
215
; CHECK-NEXT: s_swappc_b64 s[30:31], s[42:43]
199
216
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
200
- ; CHECK-NEXT: ds_write_b32 v0, v59
201
- ; CHECK-NEXT: .LBB0_10 : ; in Loop: Header=BB0_8 Depth=2
217
+ ; CHECK-NEXT: ds_write_b32 v0, v58
218
+ ; CHECK-NEXT: .LBB0_12 : ; in Loop: Header=BB0_10 Depth=2
202
219
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s59
203
- ; CHECK-NEXT: ds_read_u8 v0, v60 offset:1
220
+ ; CHECK-NEXT: ds_read_u8 v0, v59 offset:1
204
221
; CHECK-NEXT: s_mov_b32 s59, exec_lo
205
222
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
206
- ; CHECK-NEXT: v_cmpx_eq_u16_e64 v58 , v0
207
- ; CHECK-NEXT: s_cbranch_execz .LBB0_12
208
- ; CHECK-NEXT: ; %bb.11 : ; in Loop: Header=BB0_8 Depth=2
223
+ ; CHECK-NEXT: v_cmpx_eq_u16_e64 v60 , v0
224
+ ; CHECK-NEXT: s_cbranch_execz .LBB0_14
225
+ ; CHECK-NEXT: ; %bb.13 : ; in Loop: Header=BB0_10 Depth=2
209
226
; CHECK-NEXT: v_mov_b32_e32 v31, v41
210
227
; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
211
228
; CHECK-NEXT: s_add_u32 s8, s34, 40
@@ -215,19 +232,19 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
215
232
; CHECK-NEXT: s_mov_b32 s12, s41
216
233
; CHECK-NEXT: s_mov_b32 s13, s40
217
234
; CHECK-NEXT: s_mov_b32 s14, s33
218
- ; CHECK-NEXT: v_add_nc_u32_e32 v61, 1, v59
235
+ ; CHECK-NEXT: v_add_nc_u32_e32 v61, 1, v58
219
236
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
220
237
; CHECK-NEXT: s_swappc_b64 s[30:31], s[42:43]
221
238
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
222
239
; CHECK-NEXT: ds_write_b32 v0, v61
223
- ; CHECK-NEXT: .LBB0_12 : ; in Loop: Header=BB0_8 Depth=2
240
+ ; CHECK-NEXT: .LBB0_14 : ; in Loop: Header=BB0_10 Depth=2
224
241
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s59
225
- ; CHECK-NEXT: ds_read_u8 v0, v60 offset:2
242
+ ; CHECK-NEXT: ds_read_u8 v0, v59 offset:2
226
243
; CHECK-NEXT: s_mov_b32 s59, exec_lo
227
244
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
228
- ; CHECK-NEXT: v_cmpx_eq_u16_e64 v58 , v0
229
- ; CHECK-NEXT: s_cbranch_execz .LBB0_14
230
- ; CHECK-NEXT: ; %bb.13 : ; in Loop: Header=BB0_8 Depth=2
245
+ ; CHECK-NEXT: v_cmpx_eq_u16_e64 v60 , v0
246
+ ; CHECK-NEXT: s_cbranch_execz .LBB0_16
247
+ ; CHECK-NEXT: ; %bb.15 : ; in Loop: Header=BB0_10 Depth=2
231
248
; CHECK-NEXT: v_mov_b32_e32 v31, v41
232
249
; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
233
250
; CHECK-NEXT: s_add_u32 s8, s34, 40
@@ -237,19 +254,19 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
237
254
; CHECK-NEXT: s_mov_b32 s12, s41
238
255
; CHECK-NEXT: s_mov_b32 s13, s40
239
256
; CHECK-NEXT: s_mov_b32 s14, s33
240
- ; CHECK-NEXT: v_add_nc_u32_e32 v61, 2, v59
257
+ ; CHECK-NEXT: v_add_nc_u32_e32 v61, 2, v58
241
258
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
242
259
; CHECK-NEXT: s_swappc_b64 s[30:31], s[42:43]
243
260
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
244
261
; CHECK-NEXT: ds_write_b32 v0, v61
245
- ; CHECK-NEXT: .LBB0_14 : ; in Loop: Header=BB0_8 Depth=2
262
+ ; CHECK-NEXT: .LBB0_16 : ; in Loop: Header=BB0_10 Depth=2
246
263
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s59
247
- ; CHECK-NEXT: ds_read_u8 v0, v60 offset:3
264
+ ; CHECK-NEXT: ds_read_u8 v0, v59 offset:3
248
265
; CHECK-NEXT: s_mov_b32 s59, exec_lo
249
266
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
250
- ; CHECK-NEXT: v_cmpx_eq_u16_e64 v58 , v0
251
- ; CHECK-NEXT: s_cbranch_execz .LBB0_7
252
- ; CHECK-NEXT: ; %bb.15 : ; in Loop: Header=BB0_8 Depth=2
267
+ ; CHECK-NEXT: v_cmpx_eq_u16_e64 v60 , v0
268
+ ; CHECK-NEXT: s_cbranch_execz .LBB0_9
269
+ ; CHECK-NEXT: ; %bb.17 : ; in Loop: Header=BB0_10 Depth=2
253
270
; CHECK-NEXT: v_mov_b32_e32 v31, v41
254
271
; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
255
272
; CHECK-NEXT: s_add_u32 s8, s34, 40
@@ -259,45 +276,45 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
259
276
; CHECK-NEXT: s_mov_b32 s12, s41
260
277
; CHECK-NEXT: s_mov_b32 s13, s40
261
278
; CHECK-NEXT: s_mov_b32 s14, s33
262
- ; CHECK-NEXT: v_add_nc_u32_e32 v59 , 3, v59
279
+ ; CHECK-NEXT: v_add_nc_u32_e32 v58 , 3, v58
263
280
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
264
281
; CHECK-NEXT: s_swappc_b64 s[30:31], s[42:43]
265
282
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
266
- ; CHECK-NEXT: ds_write_b32 v0, v59
267
- ; CHECK-NEXT: s_branch .LBB0_7
268
- ; CHECK-NEXT: .LBB0_16 : ; %Flow43
269
- ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
283
+ ; CHECK-NEXT: ds_write_b32 v0, v58
284
+ ; CHECK-NEXT: s_branch .LBB0_9
285
+ ; CHECK-NEXT: .LBB0_18 : ; %Flow39
286
+ ; CHECK-NEXT: ; in Loop: Header=BB0_7 Depth=1
270
287
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s57
271
288
; CHECK-NEXT: v_mov_b32_e32 v57, v0
272
- ; CHECK-NEXT: .LBB0_17 : ; %Flow44
273
- ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
289
+ ; CHECK-NEXT: .LBB0_19 : ; %Flow40
290
+ ; CHECK-NEXT: ; in Loop: Header=BB0_7 Depth=1
274
291
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s56
275
292
; CHECK-NEXT: s_mov_b32 s55, exec_lo
276
- ; CHECK-NEXT: v_cmpx_lt_u32_e64 v59 , v42
277
- ; CHECK-NEXT: s_cbranch_execz .LBB0_23
278
- ; CHECK-NEXT: ; %bb.18 : ; %.preheader
279
- ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
293
+ ; CHECK-NEXT: v_cmpx_lt_u32_e64 v58 , v42
294
+ ; CHECK-NEXT: s_cbranch_execz .LBB0_6
295
+ ; CHECK-NEXT: ; %bb.20 : ; %.preheader
296
+ ; CHECK-NEXT: ; in Loop: Header=BB0_7 Depth=1
280
297
; CHECK-NEXT: s_mov_b32 s56, 0
281
298
; CHECK-NEXT: s_inst_prefetch 0x1
282
- ; CHECK-NEXT: s_branch .LBB0_20
299
+ ; CHECK-NEXT: s_branch .LBB0_22
283
300
; CHECK-NEXT: .p2align 6
284
- ; CHECK-NEXT: .LBB0_19 : ; in Loop: Header=BB0_20 Depth=2
301
+ ; CHECK-NEXT: .LBB0_21 : ; in Loop: Header=BB0_22 Depth=2
285
302
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s57
286
- ; CHECK-NEXT: v_add_nc_u32_e32 v59 , 1, v59
303
+ ; CHECK-NEXT: v_add_nc_u32_e32 v58 , 1, v58
287
304
; CHECK-NEXT: v_add_nc_u32_e32 v57, 1, v57
288
- ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v59 , v42
305
+ ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v58 , v42
289
306
; CHECK-NEXT: s_or_b32 s56, vcc_lo, s56
290
307
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s56
291
- ; CHECK-NEXT: s_cbranch_execz .LBB0_22
292
- ; CHECK-NEXT: .LBB0_20 : ; Parent Loop BB0_5 Depth=1
308
+ ; CHECK-NEXT: s_cbranch_execz .LBB0_5
309
+ ; CHECK-NEXT: .LBB0_22 : ; Parent Loop BB0_7 Depth=1
293
310
; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
294
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v59
311
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v58
295
312
; CHECK-NEXT: ds_read_u8 v0, v0
296
313
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
297
314
; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
298
315
; CHECK-NEXT: s_and_saveexec_b32 s57, s4
299
- ; CHECK-NEXT: s_cbranch_execz .LBB0_19
300
- ; CHECK-NEXT: ; %bb.21 : ; in Loop: Header=BB0_20 Depth=2
316
+ ; CHECK-NEXT: s_cbranch_execz .LBB0_21
317
+ ; CHECK-NEXT: ; %bb.23 : ; in Loop: Header=BB0_22 Depth=2
301
318
; CHECK-NEXT: v_mov_b32_e32 v31, v41
302
319
; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
303
320
; CHECK-NEXT: s_add_u32 s8, s34, 40
@@ -311,25 +328,8 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
311
328
; CHECK-NEXT: s_swappc_b64 s[30:31], s[42:43]
312
329
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
313
330
; CHECK-NEXT: ds_write_b32 v0, v57
314
- ; CHECK-NEXT: s_branch .LBB0_19
315
- ; CHECK-NEXT: .LBB0_22: ; %Flow41
316
- ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
317
- ; CHECK-NEXT: s_inst_prefetch 0x2
318
- ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s56
319
- ; CHECK-NEXT: .LBB0_23: ; %Flow42
320
- ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
321
- ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55
322
- ; CHECK-NEXT: ; %bb.24: ; in Loop: Header=BB0_5 Depth=1
323
- ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s54, v45
324
- ; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v47
325
- ; CHECK-NEXT: v_add_nc_u32_e32 v46, 1, v46
326
- ; CHECK-NEXT: s_mov_b32 s55, s54
327
- ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
328
- ; CHECK-NEXT: s_and_b32 s4, exec_lo, s4
329
- ; CHECK-NEXT: s_or_b32 s49, s4, s49
330
- ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s49
331
- ; CHECK-NEXT: s_cbranch_execnz .LBB0_5
332
- ; CHECK-NEXT: .LBB0_25: ; %Flow49
331
+ ; CHECK-NEXT: s_branch .LBB0_21
332
+ ; CHECK-NEXT: .LBB0_24: ; %Flow45
333
333
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s48
334
334
; CHECK-NEXT: v_mov_b32_e32 v31, v41
335
335
; CHECK-NEXT: v_mov_b32_e32 v0, 1
@@ -346,8 +346,8 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
346
346
; CHECK-NEXT: ds_read_b32 v47, v0 offset:15360
347
347
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
348
348
; CHECK-NEXT: v_cmpx_gt_u32_e64 v47, v40
349
- ; CHECK-NEXT: s_cbranch_execz .LBB0_33
350
- ; CHECK-NEXT: ; %bb.26 :
349
+ ; CHECK-NEXT: s_cbranch_execz .LBB0_32
350
+ ; CHECK-NEXT: ; %bb.25 :
351
351
; CHECK-NEXT: s_add_u32 s52, s44, 8
352
352
; CHECK-NEXT: s_addc_u32 s53, s45, 0
353
353
; CHECK-NEXT: s_getpc_b64 s[42:43]
@@ -360,8 +360,8 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
360
360
; CHECK-NEXT: s_getpc_b64 s[48:49]
361
361
; CHECK-NEXT: s_add_u32 s48, s48, _Z14get_local_sizej@rel32@lo+4
362
362
; CHECK-NEXT: s_addc_u32 s49, s49, _Z14get_local_sizej@rel32@hi+12
363
- ; CHECK-NEXT: s_branch .LBB0_28
364
- ; CHECK-NEXT: .LBB0_27 : ; in Loop: Header=BB0_28 Depth=1
363
+ ; CHECK-NEXT: s_branch .LBB0_27
364
+ ; CHECK-NEXT: .LBB0_26 : ; in Loop: Header=BB0_27 Depth=1
365
365
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55
366
366
; CHECK-NEXT: v_mov_b32_e32 v31, v41
367
367
; CHECK-NEXT: v_mov_b32_e32 v0, 0
@@ -377,8 +377,8 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
377
377
; CHECK-NEXT: v_cmp_le_u32_e32 vcc_lo, v47, v40
378
378
; CHECK-NEXT: s_or_b32 s54, vcc_lo, s54
379
379
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s54
380
- ; CHECK-NEXT: s_cbranch_execz .LBB0_33
381
- ; CHECK-NEXT: .LBB0_28 : ; =>This Inner Loop Header: Depth=1
380
+ ; CHECK-NEXT: s_cbranch_execz .LBB0_32
381
+ ; CHECK-NEXT: .LBB0_27 : ; =>This Inner Loop Header: Depth=1
382
382
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v40
383
383
; CHECK-NEXT: s_mov_b32 s55, exec_lo
384
384
; CHECK-NEXT: ds_read_b32 v0, v0
@@ -406,8 +406,8 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
406
406
; CHECK-NEXT: v_or_b32_e32 v5, v46, v57
407
407
; CHECK-NEXT: v_or_b32_e32 v4, v45, v56
408
408
; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[4:5]
409
- ; CHECK-NEXT: s_cbranch_execz .LBB0_27
410
- ; CHECK-NEXT: ; %bb.29 : ; in Loop: Header=BB0_28 Depth=1
409
+ ; CHECK-NEXT: s_cbranch_execz .LBB0_26
410
+ ; CHECK-NEXT: ; %bb.28 : ; in Loop: Header=BB0_27 Depth=1
411
411
; CHECK-NEXT: s_clause 0x1
412
412
; CHECK-NEXT: global_load_dwordx2 v[58:59], v[2:3], off offset:16
413
413
; CHECK-NEXT: global_load_dwordx2 v[60:61], v[0:1], off offset:16
@@ -443,8 +443,8 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
443
443
; CHECK-NEXT: s_mov_b32 s4, exec_lo
444
444
; CHECK-NEXT: v_cmpx_gt_u32_e32 12, v0
445
445
; CHECK-NEXT: s_xor_b32 s4, exec_lo, s4
446
- ; CHECK-NEXT: s_cbranch_execz .LBB0_31
447
- ; CHECK-NEXT: ; %bb.30 : ; in Loop: Header=BB0_28 Depth=1
446
+ ; CHECK-NEXT: s_cbranch_execz .LBB0_30
447
+ ; CHECK-NEXT: ; %bb.29 : ; in Loop: Header=BB0_27 Depth=1
448
448
; CHECK-NEXT: v_xor_b32_e32 v5, v60, v58
449
449
; CHECK-NEXT: v_lshrrev_b64 v[3:4], 16, v[56:57]
450
450
; CHECK-NEXT: v_mul_u32_u24_e32 v11, 0x180, v73
@@ -469,11 +469,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
469
469
; CHECK-NEXT: ; implicit-def: $vgpr42
470
470
; CHECK-NEXT: ; implicit-def: $vgpr43
471
471
; CHECK-NEXT: ; implicit-def: $vgpr44
472
- ; CHECK-NEXT: .LBB0_31 : ; %Flow
473
- ; CHECK-NEXT: ; in Loop: Header=BB0_28 Depth=1
472
+ ; CHECK-NEXT: .LBB0_30 : ; %Flow
473
+ ; CHECK-NEXT: ; in Loop: Header=BB0_27 Depth=1
474
474
; CHECK-NEXT: s_andn2_saveexec_b32 s4, s4
475
- ; CHECK-NEXT: s_cbranch_execz .LBB0_27
476
- ; CHECK-NEXT: ; %bb.32 : ; in Loop: Header=BB0_28 Depth=1
475
+ ; CHECK-NEXT: s_cbranch_execz .LBB0_26
476
+ ; CHECK-NEXT: ; %bb.31 : ; in Loop: Header=BB0_27 Depth=1
477
477
; CHECK-NEXT: v_mov_b32_e32 v31, v41
478
478
; CHECK-NEXT: v_mov_b32_e32 v0, v42
479
479
; CHECK-NEXT: v_mov_b32_e32 v1, v43
@@ -486,8 +486,8 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
486
486
; CHECK-NEXT: s_mov_b32 s13, s40
487
487
; CHECK-NEXT: s_mov_b32 s14, s33
488
488
; CHECK-NEXT: s_swappc_b64 s[30:31], s[44:45]
489
- ; CHECK-NEXT: s_branch .LBB0_27
490
- ; CHECK-NEXT: .LBB0_33 :
489
+ ; CHECK-NEXT: s_branch .LBB0_26
490
+ ; CHECK-NEXT: .LBB0_32 :
491
491
; CHECK-NEXT: s_endpgm
492
492
%6 = tail call i64 @_Z13get_global_idj (i32 noundef 0 ) #4
493
493
%7 = trunc i64 %6 to i32
@@ -878,11 +878,11 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
878
878
; CHECK-NEXT: s_or_b32 s6, vcc_lo, s6
879
879
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s6
880
880
; CHECK-NEXT: s_cbranch_execnz .LBB1_3
881
- ; CHECK-NEXT: ; %bb.4: ; %Flow3
881
+ ; CHECK-NEXT: ; %bb.4: ; %Flow2
882
882
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
883
883
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
884
884
; CHECK-NEXT: v_mov_b32_e32 v47, v0
885
- ; CHECK-NEXT: .LBB1_5: ; %Flow4
885
+ ; CHECK-NEXT: .LBB1_5: ; %Flow3
886
886
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
887
887
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
888
888
; CHECK-NEXT: s_mov_b32 s48, exec_lo
@@ -932,7 +932,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
932
932
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
933
933
; CHECK-NEXT: s_inst_prefetch 0x2
934
934
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s49
935
- ; CHECK-NEXT: .LBB1_11: ; %Flow2
935
+ ; CHECK-NEXT: .LBB1_11: ; %Flow1
936
936
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
937
937
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s48
938
938
; CHECK-NEXT: ; %bb.12: ; %.32
0 commit comments