@@ -151,8 +151,8 @@ define protected amdgpu_kernel void @nand(i32 addrspace(1)* %p, %S addrspace(1)*
   ret void
 }
 
-define protected amdgpu_kernel void @max(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
-; CHECK-LABEL: max:
+define protected amdgpu_kernel void @max_workgroup(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
+; CHECK-LABEL: max_workgroup:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
@@ -165,6 +165,41 @@ define protected amdgpu_kernel void @max(i32 addrspace(1)* %p, %S addrspace(1)*
 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
+; CHECK-NEXT: s_endpgm
+  %n32 = atomicrmw max i32 addrspace(1)* %p, i32 1 syncscope("workgroup") monotonic
+  %n64 = zext i32 %n32 to i64
+  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
+  store float 1.0, float addrspace(1)* %p1
+  ret void
+}
+
+define protected amdgpu_kernel void @max(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
+; CHECK-LABEL: max:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: .LBB7_1: ; %atomicrmw.start
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: v_mov_b32_e32 v3, v0
+; CHECK-NEXT: v_max_i32_e32 v2, 1, v3
+; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
+; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execnz .LBB7_1
+; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-NEXT: v_mov_b32_e32 v3, s3
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3]
+; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
+; CHECK-NEXT: global_store_dword v[0:1], v2, off
 ; CHECK-NEXT: s_endpgm
   %n32 = atomicrmw max i32 addrspace(1)* %p, i32 1 monotonic
   %n64 = zext i32 %n32 to i64
@@ -173,8 +208,8 @@ define protected amdgpu_kernel void @max(i32 addrspace(1)* %p, %S addrspace(1)*
   ret void
 }
 
-define protected amdgpu_kernel void @min(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
-; CHECK-LABEL: min:
+define protected amdgpu_kernel void @min_workgroup(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
+; CHECK-LABEL: min_workgroup:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
@@ -187,6 +222,41 @@ define protected amdgpu_kernel void @min(i32 addrspace(1)* %p, %S addrspace(1)*
 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
+; CHECK-NEXT: s_endpgm
+  %n32 = atomicrmw min i32 addrspace(1)* %p, i32 1 syncscope("workgroup") monotonic
+  %n64 = zext i32 %n32 to i64
+  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
+  store float 1.0, float addrspace(1)* %p1
+  ret void
+}
+
+define protected amdgpu_kernel void @min(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
+; CHECK-LABEL: min:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: .LBB9_1: ; %atomicrmw.start
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: v_mov_b32_e32 v3, v0
+; CHECK-NEXT: v_min_i32_e32 v2, 1, v3
+; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
+; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execnz .LBB9_1
+; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-NEXT: v_mov_b32_e32 v3, s3
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3]
+; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
+; CHECK-NEXT: global_store_dword v[0:1], v2, off
 ; CHECK-NEXT: s_endpgm
   %n32 = atomicrmw min i32 addrspace(1)* %p, i32 1 monotonic
   %n64 = zext i32 %n32 to i64
@@ -195,8 +265,8 @@ define protected amdgpu_kernel void @min(i32 addrspace(1)* %p, %S addrspace(1)*
   ret void
 }
 
-define protected amdgpu_kernel void @umax(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
-; CHECK-LABEL: umax:
+define protected amdgpu_kernel void @umax_workgroup(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
+; CHECK-LABEL: umax_workgroup:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
@@ -209,6 +279,41 @@ define protected amdgpu_kernel void @umax(i32 addrspace(1)* %p, %S addrspace(1)*
 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
+; CHECK-NEXT: s_endpgm
+  %n32 = atomicrmw umax i32 addrspace(1)* %p, i32 1 syncscope("workgroup") monotonic
+  %n64 = zext i32 %n32 to i64
+  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
+  store float 1.0, float addrspace(1)* %p1
+  ret void
+}
+
+define protected amdgpu_kernel void @umax(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
+; CHECK-LABEL: umax:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: .LBB11_1: ; %atomicrmw.start
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: v_mov_b32_e32 v3, v0
+; CHECK-NEXT: v_max_u32_e32 v2, 1, v3
+; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
+; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execnz .LBB11_1
+; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-NEXT: v_mov_b32_e32 v3, s3
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3]
+; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
+; CHECK-NEXT: global_store_dword v[0:1], v2, off
 ; CHECK-NEXT: s_endpgm
   %n32 = atomicrmw umax i32 addrspace(1)* %p, i32 1 monotonic
   %n64 = zext i32 %n32 to i64
@@ -217,8 +322,8 @@ define protected amdgpu_kernel void @umax(i32 addrspace(1)* %p, %S addrspace(1)*
   ret void
 }
 
-define protected amdgpu_kernel void @umin(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
-; CHECK-LABEL: umin:
+define protected amdgpu_kernel void @umin_workgroup(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
+; CHECK-LABEL: umin_workgroup:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
@@ -231,6 +336,41 @@ define protected amdgpu_kernel void @umin(i32 addrspace(1)* %p, %S addrspace(1)*
 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
+; CHECK-NEXT: s_endpgm
+  %n32 = atomicrmw umin i32 addrspace(1)* %p, i32 1 syncscope("workgroup") monotonic
+  %n64 = zext i32 %n32 to i64
+  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
+  store float 1.0, float addrspace(1)* %p1
+  ret void
+}
+
+define protected amdgpu_kernel void @umin(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
+; CHECK-LABEL: umin:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: .LBB13_1: ; %atomicrmw.start
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: v_mov_b32_e32 v3, v0
+; CHECK-NEXT: v_min_u32_e32 v2, 1, v3
+; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
+; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execnz .LBB13_1
+; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-NEXT: v_mov_b32_e32 v3, s3
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3]
+; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
+; CHECK-NEXT: global_store_dword v[0:1], v2, off
 ; CHECK-NEXT: s_endpgm
   %n32 = atomicrmw umin i32 addrspace(1)* %p, i32 1 monotonic
   %n64 = zext i32 %n32 to i64
@@ -337,7 +477,7 @@ define protected amdgpu_kernel void @fadd(float addrspace(1)* %p, %S addrspace(1
 ; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT: v_mov_b32_e32 v0, s6
-; CHECK-NEXT: .LBB14_1: ; %atomicrmw.start
+; CHECK-NEXT: .LBB18_1: ; %atomicrmw.start
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: v_mov_b32_e32 v3, v0
 ; CHECK-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -346,7 +486,7 @@ define protected amdgpu_kernel void @fadd(float addrspace(1)* %p, %S addrspace(1
 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
 ; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; CHECK-NEXT: s_cbranch_execnz .LBB14_1
+; CHECK-NEXT: s_cbranch_execnz .LBB18_1
 ; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
 ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v0
@@ -374,7 +514,7 @@ define protected amdgpu_kernel void @fsub(float addrspace(1)* %p, %S addrspace(1
 ; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT: v_mov_b32_e32 v0, s6
-; CHECK-NEXT: .LBB15_1: ; %atomicrmw.start
+; CHECK-NEXT: .LBB19_1: ; %atomicrmw.start
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: v_mov_b32_e32 v3, v0
 ; CHECK-NEXT: v_add_f32_e32 v2, -1.0, v3
@@ -383,7 +523,7 @@ define protected amdgpu_kernel void @fsub(float addrspace(1)* %p, %S addrspace(1
 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
 ; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; CHECK-NEXT: s_cbranch_execnz .LBB15_1
+; CHECK-NEXT: s_cbranch_execnz .LBB19_1
 ; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
 ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v0