@@ -97,10 +97,8 @@ define amdgpu_kernel void @fshl_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
- ; SI-NEXT: s_lshr_b32 s1, s1, 25
- ; SI-NEXT: s_lshl_b32 s0, s0, 7
- ; SI-NEXT: s_or_b32 s0, s0, s1
- ; SI-NEXT: v_mov_b32_e32 v0, s0
+ ; SI-NEXT: v_mov_b32_e32 v0, s1
+ ; SI-NEXT: v_alignbit_b32 v0, s0, v0, 25
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -109,12 +107,10 @@ define amdgpu_kernel void @fshl_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) {
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
+ ; VI-NEXT: v_mov_b32_e32 v0, s1
+ ; VI-NEXT: v_alignbit_b32 v2, s0, v0, 25
; VI-NEXT: v_mov_b32_e32 v0, s2
- ; VI-NEXT: s_lshr_b32 s1, s1, 25
- ; VI-NEXT: s_lshl_b32 s0, s0, 7
- ; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: v_mov_b32_e32 v1, s3
- ; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -123,28 +119,24 @@ define amdgpu_kernel void @fshl_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) {
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+ ; GFX9-NEXT: v_mov_b32_e32 v0, s1
+ ; GFX9-NEXT: v_alignbit_b32 v2, s0, v0, 25
; GFX9-NEXT: v_mov_b32_e32 v0, s2
- ; GFX9-NEXT: s_lshr_b32 s1, s1, 25
- ; GFX9-NEXT: s_lshl_b32 s0, s0, 7
- ; GFX9-NEXT: s_or_b32 s0, s0, s1
; GFX9-NEXT: v_mov_b32_e32 v1, s3
- ; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshl_i32_imm:
; R600: ; %bb.0: ; %entry
- ; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
- ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+ ; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+ ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
- ; R600-NEXT: LSHL T0.W, KC0[2].Z, literal.x,
- ; R600-NEXT: LSHR * T1.W, KC0[2].W, literal.y,
- ; R600-NEXT: 7(9.809089e-45), 25(3.503246e-44)
- ; R600-NEXT: OR_INT T0.X, PV.W, PS,
- ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+ ; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ ; R600-NEXT: BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, literal.x,
+ ; R600-NEXT: 25(3.503246e-44), 0(0.000000e+00)
entry:
%0 = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 7)
store i32 %0, i32 addrspace(1)* %in
@@ -283,14 +275,10 @@ define amdgpu_kernel void @fshl_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32>
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
- ; SI-NEXT: s_lshl_b32 s3, s3, 9
- ; SI-NEXT: s_lshr_b32 s1, s1, 23
- ; SI-NEXT: s_lshr_b32 s0, s0, 25
- ; SI-NEXT: s_lshl_b32 s2, s2, 7
- ; SI-NEXT: s_or_b32 s1, s3, s1
- ; SI-NEXT: s_or_b32 s0, s2, s0
+ ; SI-NEXT: v_mov_b32_e32 v0, s1
+ ; SI-NEXT: v_alignbit_b32 v1, s3, v0, 23
; SI-NEXT: v_mov_b32_e32 v0, s0
- ; SI-NEXT: v_mov_b32_e32 v1, s1
+ ; SI-NEXT: v_alignbit_b32 v0, s2, v0, 25
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -300,15 +288,11 @@ define amdgpu_kernel void @fshl_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32>
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
+ ; VI-NEXT: v_mov_b32_e32 v0, s1
+ ; VI-NEXT: v_mov_b32_e32 v2, s0
+ ; VI-NEXT: v_alignbit_b32 v1, s5, v0, 23
+ ; VI-NEXT: v_alignbit_b32 v0, s4, v2, 25
; VI-NEXT: v_mov_b32_e32 v2, s2
- ; VI-NEXT: s_lshl_b32 s5, s5, 9
- ; VI-NEXT: s_lshr_b32 s1, s1, 23
- ; VI-NEXT: s_lshr_b32 s0, s0, 25
- ; VI-NEXT: s_lshl_b32 s4, s4, 7
- ; VI-NEXT: s_or_b32 s1, s5, s1
- ; VI-NEXT: s_or_b32 s0, s4, s0
- ; VI-NEXT: v_mov_b32_e32 v0, s0
- ; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -319,34 +303,26 @@ define amdgpu_kernel void @fshl_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32>
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+ ; GFX9-NEXT: v_mov_b32_e32 v0, s1
+ ; GFX9-NEXT: v_mov_b32_e32 v2, s0
+ ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 23
+ ; GFX9-NEXT: v_alignbit_b32 v0, s4, v2, 25
; GFX9-NEXT: v_mov_b32_e32 v2, s2
- ; GFX9-NEXT: s_lshl_b32 s5, s5, 9
- ; GFX9-NEXT: s_lshr_b32 s1, s1, 23
- ; GFX9-NEXT: s_lshr_b32 s0, s0, 25
- ; GFX9-NEXT: s_lshl_b32 s4, s4, 7
- ; GFX9-NEXT: s_or_b32 s1, s5, s1
- ; GFX9-NEXT: s_or_b32 s0, s4, s0
- ; GFX9-NEXT: v_mov_b32_e32 v0, s0
- ; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshl_v2i32_imm:
; R600: ; %bb.0: ; %entry
- ; R600-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
+ ; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
- ; R600-NEXT: LSHL T0.W, KC0[3].X, literal.x,
- ; R600-NEXT: LSHR * T1.W, KC0[3].Z, literal.y,
- ; R600-NEXT: 9(1.261169e-44), 23(3.222986e-44)
- ; R600-NEXT: OR_INT T0.Y, PV.W, PS,
- ; R600-NEXT: LSHL T0.W, KC0[2].W, literal.x,
- ; R600-NEXT: LSHR * T1.W, KC0[3].Y, literal.y,
- ; R600-NEXT: 7(9.809089e-45), 25(3.503246e-44)
- ; R600-NEXT: OR_INT T0.X, PV.W, PS,
+ ; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].X, KC0[3].Z, literal.x,
+ ; R600-NEXT: 23(3.222986e-44), 0(0.000000e+00)
+ ; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, literal.x,
+ ; R600-NEXT: 25(3.503246e-44), 0(0.000000e+00)
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
@@ -557,22 +533,14 @@ define amdgpu_kernel void @fshl_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32>
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
- ; SI-NEXT: s_lshl_b32 s11, s11, 1
- ; SI-NEXT: s_lshr_b32 s3, s3, 31
- ; SI-NEXT: s_lshr_b32 s2, s2, 23
- ; SI-NEXT: s_lshl_b32 s10, s10, 9
- ; SI-NEXT: s_lshr_b32 s1, s1, 25
- ; SI-NEXT: s_lshl_b32 s9, s9, 7
- ; SI-NEXT: s_lshr_b32 s0, s0, 31
- ; SI-NEXT: s_lshl_b32 s8, s8, 1
- ; SI-NEXT: s_or_b32 s3, s11, s3
- ; SI-NEXT: s_or_b32 s2, s10, s2
- ; SI-NEXT: s_or_b32 s1, s9, s1
- ; SI-NEXT: s_or_b32 s0, s8, s0
+ ; SI-NEXT: v_mov_b32_e32 v0, s3
+ ; SI-NEXT: v_alignbit_b32 v3, s11, v0, 31
+ ; SI-NEXT: v_mov_b32_e32 v0, s2
+ ; SI-NEXT: v_alignbit_b32 v2, s10, v0, 23
+ ; SI-NEXT: v_mov_b32_e32 v0, s1
+ ; SI-NEXT: v_alignbit_b32 v1, s9, v0, 25
; SI-NEXT: v_mov_b32_e32 v0, s0
- ; SI-NEXT: v_mov_b32_e32 v1, s1
- ; SI-NEXT: v_mov_b32_e32 v2, s2
- ; SI-NEXT: v_mov_b32_e32 v3, s3
+ ; SI-NEXT: v_alignbit_b32 v0, s8, v0, 31
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -583,23 +551,15 @@ define amdgpu_kernel void @fshl_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32>
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, s8
- ; VI-NEXT: s_lshl_b32 s7, s7, 1
- ; VI-NEXT: s_lshr_b32 s3, s3, 31
- ; VI-NEXT: s_lshr_b32 s2, s2, 23
- ; VI-NEXT: s_lshl_b32 s6, s6, 9
- ; VI-NEXT: s_lshr_b32 s1, s1, 25
- ; VI-NEXT: s_lshl_b32 s5, s5, 7
- ; VI-NEXT: s_lshr_b32 s0, s0, 31
- ; VI-NEXT: s_lshl_b32 s4, s4, 1
- ; VI-NEXT: s_or_b32 s3, s7, s3
- ; VI-NEXT: s_or_b32 s2, s6, s2
- ; VI-NEXT: s_or_b32 s1, s5, s1
- ; VI-NEXT: s_or_b32 s0, s4, s0
- ; VI-NEXT: v_mov_b32_e32 v0, s0
- ; VI-NEXT: v_mov_b32_e32 v1, s1
- ; VI-NEXT: v_mov_b32_e32 v2, s2
- ; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v5, s9
+ ; VI-NEXT: v_mov_b32_e32 v0, s3
+ ; VI-NEXT: v_mov_b32_e32 v1, s2
+ ; VI-NEXT: v_alignbit_b32 v3, s7, v0, 31
+ ; VI-NEXT: v_mov_b32_e32 v0, s1
+ ; VI-NEXT: v_alignbit_b32 v2, s6, v1, 23
+ ; VI-NEXT: v_alignbit_b32 v1, s5, v0, 25
+ ; VI-NEXT: v_mov_b32_e32 v0, s0
+ ; VI-NEXT: v_alignbit_b32 v0, s4, v0, 31
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -610,49 +570,33 @@ define amdgpu_kernel void @fshl_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32>
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, s8
- ; GFX9-NEXT: s_lshl_b32 s7, s7, 1
- ; GFX9-NEXT: s_lshr_b32 s3, s3, 31
- ; GFX9-NEXT: s_lshr_b32 s2, s2, 23
- ; GFX9-NEXT: s_lshl_b32 s6, s6, 9
- ; GFX9-NEXT: s_lshr_b32 s1, s1, 25
- ; GFX9-NEXT: s_lshl_b32 s5, s5, 7
- ; GFX9-NEXT: s_lshr_b32 s0, s0, 31
- ; GFX9-NEXT: s_lshl_b32 s4, s4, 1
- ; GFX9-NEXT: s_or_b32 s3, s7, s3
- ; GFX9-NEXT: s_or_b32 s2, s6, s2
- ; GFX9-NEXT: s_or_b32 s1, s5, s1
- ; GFX9-NEXT: s_or_b32 s0, s4, s0
- ; GFX9-NEXT: v_mov_b32_e32 v0, s0
- ; GFX9-NEXT: v_mov_b32_e32 v1, s1
- ; GFX9-NEXT: v_mov_b32_e32 v2, s2
- ; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_mov_b32_e32 v5, s9
+ ; GFX9-NEXT: v_mov_b32_e32 v0, s3
+ ; GFX9-NEXT: v_mov_b32_e32 v1, s2
+ ; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, 31
+ ; GFX9-NEXT: v_mov_b32_e32 v0, s1
+ ; GFX9-NEXT: v_alignbit_b32 v2, s6, v1, 23
+ ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 25
+ ; GFX9-NEXT: v_mov_b32_e32 v0, s0
+ ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 31
; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshl_v4i32_imm:
; R600: ; %bb.0: ; %entry
- ; R600-NEXT: ALU 17, @4, KC0[CB0:0-32], KC1[]
+ ; R600-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
- ; R600-NEXT: LSHL T0.W, KC0[4].X, 1,
- ; R600-NEXT: LSHR * T1.W, KC0[5].X, literal.x,
+ ; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, literal.x,
; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
- ; R600-NEXT: LSHL T0.Z, KC0[3].W, literal.x,
- ; R600-NEXT: LSHR T2.W, KC0[4].W, literal.y,
- ; R600-NEXT: OR_INT * T0.W, PV.W, PS,
- ; R600-NEXT: 9(1.261169e-44), 23(3.222986e-44)
- ; R600-NEXT: OR_INT T0.Z, PV.Z, PV.W,
- ; R600-NEXT: LSHL T1.W, KC0[3].Z, literal.x,
- ; R600-NEXT: LSHR * T2.W, KC0[4].Z, literal.y,
- ; R600-NEXT: 7(9.809089e-45), 25(3.503246e-44)
- ; R600-NEXT: OR_INT T0.Y, PV.W, PS,
- ; R600-NEXT: LSHL T1.W, KC0[3].Y, 1,
- ; R600-NEXT: LSHR * T2.W, KC0[4].Y, literal.x,
+ ; R600-NEXT: BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, literal.x,
+ ; R600-NEXT: 23(3.222986e-44), 0(0.000000e+00)
+ ; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, literal.x,
+ ; R600-NEXT: 25(3.503246e-44), 0(0.000000e+00)
+ ; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, literal.x,
; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
- ; R600-NEXT: OR_INT T0.X, PV.W, PS,
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
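Every hunk above is the same rewrite: for a constant shift amount n in 1..31, fshl(x, y, n) on i32 equals (x << n) | (y >> (32 - n)), and the backend now folds the shl/lshr/or triple into a single v_alignbit_b32 (BIT_ALIGN_INT on R600) with shift amount 32 - n, which is why 7 becomes 25, 9 becomes 23, and 1 becomes 31 in the checks. A minimal C sketch of that identity follows, assuming v_alignbit_b32 dst, a, b, c yields the low 32 bits of the 64-bit concatenation {a:b} shifted right by c; the helper names are hypothetical.

#include <assert.h>
#include <stdint.h>

/* fshl(x, y, 7) on i32, written out: (x << 7) | (y >> 25). */
static uint32_t fshl7(uint32_t x, uint32_t y) {
    return (x << 7) | (y >> 25);
}

/* Assumed v_alignbit_b32 semantics: low 32 bits of {x:y} >> 25. */
static uint32_t alignbit25(uint32_t x, uint32_t y) {
    return (uint32_t)((((uint64_t)x << 32) | (uint64_t)y) >> 25);
}

int main(void) {
    /* The two forms agree for all inputs; one spot check. */
    assert(fshl7(0x12345678u, 0x9abcdef0u) ==
           alignbit25(0x12345678u, 0x9abcdef0u));
    return 0;
}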