@@ -228,6 +228,134 @@ entry:
228
228
ret void
229
229
}
230
230
231
; rotr_v8i32: kernel that combines, per lane of <8 x i32>, a left shift of %x
; by (32 - %y) with a logical right shift of %x by %y using `or`, and stores
; the <8 x i32> result through the addrspace(1) pointer %in.
; The expectations below show one BIT_ALIGN_INT / v_alignbit_b32 per lane, each
; using the same source register for both data operands, followed by two
; four-dword stores (one at byte offset 16, one at offset 0).
; NOTE(review): the assertion lines look autogenerated; presumably they should
; be regenerated with the update script rather than edited by hand -- confirm.
+ define amdgpu_kernel void @rotr_v8i32 (ptr addrspace (1 ) %in , <8 x i32 > %x , <8 x i32 > %y ) {
232
+ ; R600-LABEL: rotr_v8i32:
233
+ ; R600: ; %bb.0: ; %entry
234
+ ; R600-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[]
235
+ ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T3.X, 0
236
+ ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
237
+ ; R600-NEXT: CF_END
238
+ ; R600-NEXT: ALU clause starting at 4:
239
+ ; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[5].X, KC0[5].X, KC0[7].X,
240
+ ; R600-NEXT: BIT_ALIGN_INT * T0.Z, KC0[4].W, KC0[4].W, KC0[6].W,
241
+ ; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[4].Z, KC0[4].Z, KC0[6].Z,
242
+ ; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[4].Y, KC0[4].Y, KC0[6].Y,
243
+ ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
244
+ ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
245
+ ; R600-NEXT: BIT_ALIGN_INT * T2.W, KC0[6].X, KC0[6].X, KC0[8].X,
246
+ ; R600-NEXT: BIT_ALIGN_INT * T2.Z, KC0[5].W, KC0[5].W, KC0[7].W,
247
+ ; R600-NEXT: BIT_ALIGN_INT * T2.Y, KC0[5].Z, KC0[5].Z, KC0[7].Z,
248
+ ; R600-NEXT: BIT_ALIGN_INT * T2.X, KC0[5].Y, KC0[5].Y, KC0[7].Y,
249
+ ; R600-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
250
+ ; R600-NEXT: 16(2.242078e-44), 0(0.000000e+00)
251
+ ; R600-NEXT: LSHR * T3.X, PV.W, literal.x,
252
+ ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
253
+ ;
254
+ ; SI-LABEL: rotr_v8i32:
255
+ ; SI: ; %bb.0: ; %entry
256
+ ; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x11
257
+ ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
258
+ ; SI-NEXT: s_mov_b32 s3, 0xf000
259
+ ; SI-NEXT: s_mov_b32 s2, -1
260
+ ; SI-NEXT: s_waitcnt lgkmcnt(0)
261
+ ; SI-NEXT: v_mov_b32_e32 v0, s19
262
+ ; SI-NEXT: v_alignbit_b32 v3, s11, s11, v0
263
+ ; SI-NEXT: v_mov_b32_e32 v0, s18
264
+ ; SI-NEXT: v_alignbit_b32 v2, s10, s10, v0
265
+ ; SI-NEXT: v_mov_b32_e32 v0, s17
266
+ ; SI-NEXT: v_alignbit_b32 v1, s9, s9, v0
267
+ ; SI-NEXT: v_mov_b32_e32 v0, s16
268
+ ; SI-NEXT: v_alignbit_b32 v0, s8, s8, v0
269
+ ; SI-NEXT: v_mov_b32_e32 v4, s23
270
+ ; SI-NEXT: v_alignbit_b32 v7, s15, s15, v4
271
+ ; SI-NEXT: v_mov_b32_e32 v4, s22
272
+ ; SI-NEXT: v_alignbit_b32 v6, s14, s14, v4
273
+ ; SI-NEXT: v_mov_b32_e32 v4, s21
274
+ ; SI-NEXT: v_alignbit_b32 v5, s13, s13, v4
275
+ ; SI-NEXT: v_mov_b32_e32 v4, s20
276
+ ; SI-NEXT: v_alignbit_b32 v4, s12, s12, v4
277
+ ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
278
+ ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
279
+ ; SI-NEXT: s_endpgm
280
+ ;
281
+ ; GFX8-LABEL: rotr_v8i32:
282
+ ; GFX8: ; %bb.0: ; %entry
283
+ ; GFX8-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x44
284
+ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
285
+ ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
286
+ ; GFX8-NEXT: v_mov_b32_e32 v1, s18
287
+ ; GFX8-NEXT: v_mov_b32_e32 v4, s17
288
+ ; GFX8-NEXT: v_alignbit_b32 v2, s10, s10, v1
289
+ ; GFX8-NEXT: v_alignbit_b32 v1, s9, s9, v4
290
+ ; GFX8-NEXT: v_mov_b32_e32 v4, s23
291
+ ; GFX8-NEXT: v_alignbit_b32 v7, s15, s15, v4
292
+ ; GFX8-NEXT: v_mov_b32_e32 v4, s22
293
+ ; GFX8-NEXT: s_add_u32 s2, s0, 16
294
+ ; GFX8-NEXT: v_alignbit_b32 v6, s14, s14, v4
295
+ ; GFX8-NEXT: v_mov_b32_e32 v4, s21
296
+ ; GFX8-NEXT: s_addc_u32 s3, s1, 0
297
+ ; GFX8-NEXT: v_alignbit_b32 v5, s13, s13, v4
298
+ ; GFX8-NEXT: v_mov_b32_e32 v4, s20
299
+ ; GFX8-NEXT: v_mov_b32_e32 v9, s3
300
+ ; GFX8-NEXT: v_mov_b32_e32 v0, s19
301
+ ; GFX8-NEXT: v_alignbit_b32 v4, s12, s12, v4
302
+ ; GFX8-NEXT: v_mov_b32_e32 v8, s2
303
+ ; GFX8-NEXT: v_alignbit_b32 v3, s11, s11, v0
304
+ ; GFX8-NEXT: v_mov_b32_e32 v0, s16
305
+ ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
306
+ ; GFX8-NEXT: v_alignbit_b32 v0, s8, s8, v0
307
+ ; GFX8-NEXT: v_mov_b32_e32 v5, s1
308
+ ; GFX8-NEXT: v_mov_b32_e32 v4, s0
309
+ ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
310
+ ; GFX8-NEXT: s_endpgm
311
+ ;
312
+ ; GFX10-LABEL: rotr_v8i32:
313
+ ; GFX10: ; %bb.0: ; %entry
314
+ ; GFX10-NEXT: s_clause 0x1
315
+ ; GFX10-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x44
316
+ ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
317
+ ; GFX10-NEXT: v_mov_b32_e32 v8, 0
318
+ ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
319
+ ; GFX10-NEXT: v_alignbit_b32 v7, s15, s15, s23
320
+ ; GFX10-NEXT: v_alignbit_b32 v6, s14, s14, s22
321
+ ; GFX10-NEXT: v_alignbit_b32 v5, s13, s13, s21
322
+ ; GFX10-NEXT: v_alignbit_b32 v4, s12, s12, s20
323
+ ; GFX10-NEXT: v_alignbit_b32 v3, s11, s11, s19
324
+ ; GFX10-NEXT: v_alignbit_b32 v2, s10, s10, s18
325
+ ; GFX10-NEXT: v_alignbit_b32 v1, s9, s9, s17
326
+ ; GFX10-NEXT: v_alignbit_b32 v0, s8, s8, s16
327
+ ; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
328
+ ; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
329
+ ; GFX10-NEXT: s_endpgm
330
+ ;
331
+ ; GFX11-LABEL: rotr_v8i32:
332
+ ; GFX11: ; %bb.0: ; %entry
333
+ ; GFX11-NEXT: s_clause 0x1
334
+ ; GFX11-NEXT: s_load_b512 s[8:23], s[4:5], 0x44
335
+ ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
336
+ ; GFX11-NEXT: v_mov_b32_e32 v8, 0
337
+ ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
338
+ ; GFX11-NEXT: v_alignbit_b32 v7, s15, s15, s23
339
+ ; GFX11-NEXT: v_alignbit_b32 v6, s14, s14, s22
340
+ ; GFX11-NEXT: v_alignbit_b32 v5, s13, s13, s21
341
+ ; GFX11-NEXT: v_alignbit_b32 v4, s12, s12, s20
342
+ ; GFX11-NEXT: v_alignbit_b32 v3, s11, s11, s19
343
+ ; GFX11-NEXT: v_alignbit_b32 v2, s10, s10, s18
344
+ ; GFX11-NEXT: v_alignbit_b32 v1, s9, s9, s17
345
+ ; GFX11-NEXT: v_alignbit_b32 v0, s8, s8, s16
346
+ ; GFX11-NEXT: s_clause 0x1
347
+ ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
348
+ ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1]
349
+ ; GFX11-NEXT: s_endpgm
350
+ entry:
351
; %tmp0 = 32 - %y per lane; used as the left-shift amount below.
; NOTE(review): a lane of %y equal to 0 makes this amount 32, and the LLVM
; language reference gives shl by the full bit width a poison result;
; presumably fine for a codegen-only test -- confirm.
+ %tmp0 = sub <8 x i32 > <i32 32 , i32 32 , i32 32 , i32 32 , i32 32 , i32 32 , i32 32 , i32 32 >, %y
352
; %tmp1 = %x shifted left by (32 - %y): the bits that wrap around to the top.
+ %tmp1 = shl <8 x i32 > %x , %tmp0
353
; %tmp2 = %x logically shifted right by %y: the bits that stay in the low part.
+ %tmp2 = lshr <8 x i32 > %x , %y
354
; Merge the two halves into the final per-lane value.
+ %tmp3 = or <8 x i32 > %tmp1 , %tmp2
355
+ store <8 x i32 > %tmp3 , ptr addrspace (1 ) %in
356
+ ret void
357
+ }
358
+
231
359
declare i16 @llvm.fshr.i16 (i16 , i16 , i16 )
232
360
233
361
define void @test_rotr_i16 (ptr addrspace (1 ) nocapture readonly %sourceA , ptr addrspace (1 ) nocapture readonly %sourceB , ptr addrspace (1 ) nocapture %destValues ) {
0 commit comments