@@ -22,19 +22,19 @@ define amdgpu_kernel void @s_abs_v2i16(ptr addrspace(1) %out, <2 x i16> %val) #0
22
22
; VI-NEXT: s_load_dword s4, s[2:3], 0x2c
23
23
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
24
24
; VI-NEXT: s_waitcnt lgkmcnt(0)
25
- ; VI-NEXT: s_lshr_b32 s2, s4, 16
26
- ; VI-NEXT: s_sub_i32 s3, 0, s4
25
+ ; VI-NEXT: s_sub_i32 s2, 0, s4
26
+ ; VI-NEXT: s_lshr_b32 s3, s4, 16
27
27
; VI-NEXT: s_ashr_i32 s5, s4, 16
28
+ ; VI-NEXT: s_sub_i32 s3, 0, s3
29
+ ; VI-NEXT: s_sext_i32_i16 s2, s2
28
30
; VI-NEXT: s_sext_i32_i16 s4, s4
29
- ; VI-NEXT: s_sub_i32 s2, 0, s2
30
31
; VI-NEXT: s_sext_i32_i16 s3, s3
31
- ; VI-NEXT: s_sext_i32_i16 s2, s2
32
- ; VI-NEXT: s_max_i32 s3, s4, s3
33
- ; VI-NEXT: s_max_i32 s2, s5, s2
34
- ; VI-NEXT: s_add_i32 s3, s3, 2
35
- ; VI-NEXT: s_lshl_b32 s2, s2, 16
36
- ; VI-NEXT: s_and_b32 s3, s3, 0xffff
37
- ; VI-NEXT: s_or_b32 s2, s2, s3
32
+ ; VI-NEXT: s_max_i32 s2, s4, s2
33
+ ; VI-NEXT: s_max_i32 s3, s5, s3
34
+ ; VI-NEXT: s_add_i32 s2, s2, 2
35
+ ; VI-NEXT: s_lshl_b32 s3, s3, 16
36
+ ; VI-NEXT: s_and_b32 s2, s2, 0xffff
37
+ ; VI-NEXT: s_or_b32 s2, s3, s2
38
38
; VI-NEXT: s_add_i32 s2, s2, 0x20000
39
39
; VI-NEXT: v_mov_b32_e32 v0, s0
40
40
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -171,19 +171,19 @@ define amdgpu_kernel void @s_abs_v2i16_2(ptr addrspace(1) %out, <2 x i16> %val)
171
171
; VI-NEXT: s_load_dword s4, s[2:3], 0x2c
172
172
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
173
173
; VI-NEXT: s_waitcnt lgkmcnt(0)
174
- ; VI-NEXT: s_lshr_b32 s2, s4, 16
175
- ; VI-NEXT: s_sub_i32 s3, 0, s4
174
+ ; VI-NEXT: s_sub_i32 s2, 0, s4
175
+ ; VI-NEXT: s_lshr_b32 s3, s4, 16
176
176
; VI-NEXT: s_ashr_i32 s5, s4, 16
177
+ ; VI-NEXT: s_sub_i32 s3, 0, s3
178
+ ; VI-NEXT: s_sext_i32_i16 s2, s2
177
179
; VI-NEXT: s_sext_i32_i16 s4, s4
178
- ; VI-NEXT: s_sub_i32 s2, 0, s2
179
180
; VI-NEXT: s_sext_i32_i16 s3, s3
180
- ; VI-NEXT: s_sext_i32_i16 s2, s2
181
- ; VI-NEXT: s_max_i32 s3, s4, s3
182
- ; VI-NEXT: s_max_i32 s2, s5, s2
183
- ; VI-NEXT: s_add_i32 s3, s3, 2
184
- ; VI-NEXT: s_lshl_b32 s2, s2, 16
185
- ; VI-NEXT: s_and_b32 s3, s3, 0xffff
186
- ; VI-NEXT: s_or_b32 s2, s2, s3
181
+ ; VI-NEXT: s_max_i32 s2, s4, s2
182
+ ; VI-NEXT: s_max_i32 s3, s5, s3
183
+ ; VI-NEXT: s_add_i32 s2, s2, 2
184
+ ; VI-NEXT: s_lshl_b32 s3, s3, 16
185
+ ; VI-NEXT: s_and_b32 s2, s2, 0xffff
186
+ ; VI-NEXT: s_or_b32 s2, s3, s2
187
187
; VI-NEXT: s_add_i32 s2, s2, 0x20000
188
188
; VI-NEXT: v_mov_b32_e32 v0, s0
189
189
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -331,31 +331,31 @@ define amdgpu_kernel void @s_abs_v4i16(ptr addrspace(1) %out, <4 x i16> %val) #0
331
331
; VI: ; %bb.0:
332
332
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
333
333
; VI-NEXT: s_waitcnt lgkmcnt(0)
334
- ; VI-NEXT: s_lshr_b32 s4, s2, 16
335
- ; VI-NEXT: s_lshr_b32 s5, s3, 16
336
- ; VI-NEXT: s_sub_i32 s6, 0, s3
337
- ; VI-NEXT: s_sub_i32 s7, 0, s2
338
- ; VI-NEXT: s_sub_i32 s5, 0, s5
339
- ; VI-NEXT: s_sub_i32 s4, 0, s4
334
+ ; VI-NEXT: s_lshr_b32 s7, s2, 16
335
+ ; VI-NEXT: s_sub_i32 s7, 0, s7
336
+ ; VI-NEXT: s_sub_i32 s4, 0, s3
337
+ ; VI-NEXT: s_lshr_b32 s6, s3, 16
340
338
; VI-NEXT: s_ashr_i32 s8, s2, 16
341
- ; VI-NEXT: s_ashr_i32 s9, s3, 16
342
- ; VI-NEXT: s_sext_i32_i16 s2, s2
343
- ; VI-NEXT: s_sext_i32_i16 s3, s3
344
339
; VI-NEXT: s_sext_i32_i16 s7, s7
345
- ; VI-NEXT: s_sext_i32_i16 s6, s6
340
+ ; VI-NEXT: s_sub_i32 s5, 0, s2
341
+ ; VI-NEXT: s_sub_i32 s6, 0, s6
342
+ ; VI-NEXT: s_max_i32 s7, s8, s7
343
+ ; VI-NEXT: s_ashr_i32 s8, s3, 16
346
344
; VI-NEXT: s_sext_i32_i16 s4, s4
345
+ ; VI-NEXT: s_sext_i32_i16 s3, s3
346
+ ; VI-NEXT: s_sext_i32_i16 s6, s6
347
347
; VI-NEXT: s_sext_i32_i16 s5, s5
348
- ; VI-NEXT: s_max_i32 s3, s3, s6
349
- ; VI-NEXT: s_max_i32 s2, s2, s7
350
- ; VI-NEXT: s_max_i32 s5, s9, s5
351
- ; VI-NEXT: s_max_i32 s4, s8, s4
352
- ; VI-NEXT: s_add_i32 s2, s2, 2
348
+ ; VI-NEXT: s_sext_i32_i16 s2, s2
349
+ ; VI-NEXT: s_max_i32 s3, s3, s4
350
+ ; VI-NEXT: s_max_i32 s6, s8, s6
351
+ ; VI-NEXT: s_max_i32 s2, s2, s5
353
352
; VI-NEXT: s_add_i32 s3, s3, 2
354
- ; VI-NEXT: s_lshl_b32 s4, s4, 16
355
- ; VI-NEXT: s_lshl_b32 s5, s5, 16
353
+ ; VI-NEXT: s_lshl_b32 s4, s6, 16
356
354
; VI-NEXT: s_and_b32 s3, s3, 0xffff
355
+ ; VI-NEXT: s_add_i32 s2, s2, 2
356
+ ; VI-NEXT: s_or_b32 s3, s4, s3
357
+ ; VI-NEXT: s_lshl_b32 s4, s7, 16
357
358
; VI-NEXT: s_and_b32 s2, s2, 0xffff
358
- ; VI-NEXT: s_or_b32 s3, s5, s3
359
359
; VI-NEXT: s_or_b32 s2, s4, s2
360
360
; VI-NEXT: s_add_i32 s3, s3, 0x20000
361
361
; VI-NEXT: s_add_i32 s2, s2, 0x20000
@@ -559,21 +559,21 @@ define amdgpu_kernel void @s_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace
559
559
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
560
560
; VI-NEXT: s_waitcnt lgkmcnt(0)
561
561
; VI-NEXT: v_mov_b32_e32 v0, s4
562
- ; VI-NEXT: s_ashr_i32 s2, s0, 16
563
- ; VI-NEXT: s_sext_i32_i16 s0, s0
564
- ; VI-NEXT: s_ashr_i32 s3, s1, 16
562
+ ; VI-NEXT: s_ashr_i32 s2, s1, 16
563
+ ; VI-NEXT: s_ashr_i32 s3, s0, 16
565
564
; VI-NEXT: s_sext_i32_i16 s1, s1
565
+ ; VI-NEXT: s_sext_i32_i16 s0, s0
566
566
; VI-NEXT: v_mov_b32_e32 v1, s5
567
- ; VI-NEXT: s_max_i32 s4, s2, s3
567
+ ; VI-NEXT: s_max_i32 s4, s3, s2
568
568
; VI-NEXT: s_max_i32 s5, s0, s1
569
569
; VI-NEXT: s_lshl_b32 s4, s4, 16
570
570
; VI-NEXT: s_and_b32 s5, s5, 0xffff
571
- ; VI-NEXT: s_min_i32 s2, s2, s3
571
+ ; VI-NEXT: s_min_i32 s2, s3, s2
572
572
; VI-NEXT: s_min_i32 s0, s0, s1
573
573
; VI-NEXT: s_or_b32 s4, s5, s4
574
- ; VI-NEXT: s_lshl_b32 s1, s2, 16
574
+ ; VI-NEXT: s_lshl_b32 s2, s2, 16
575
575
; VI-NEXT: s_and_b32 s0, s0, 0xffff
576
- ; VI-NEXT: s_or_b32 s0, s0, s1
576
+ ; VI-NEXT: s_or_b32 s0, s0, s2
577
577
; VI-NEXT: v_mov_b32_e32 v4, s4
578
578
; VI-NEXT: v_mov_b32_e32 v2, s6
579
579
; VI-NEXT: v_mov_b32_e32 v3, s7
@@ -661,12 +661,12 @@ define amdgpu_kernel void @v_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace
661
661
; VI-NEXT: v_mov_b32_e32 v1, s1
662
662
; VI-NEXT: v_mov_b32_e32 v2, s2
663
663
; VI-NEXT: v_mov_b32_e32 v3, s3
664
- ; VI-NEXT: v_max_i32_sdwa v6, sext(v4), sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
665
- ; VI-NEXT: v_max_i32_sdwa v7, sext(v4), sext(v5) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
666
- ; VI-NEXT: v_min_i32_sdwa v8, sext(v4), sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
667
- ; VI-NEXT: v_min_i32_sdwa v4, sext(v4), sext(v5) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
668
- ; VI-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
669
- ; VI-NEXT: v_or_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
664
+ ; VI-NEXT: v_max_i32_sdwa v6, sext(v4), sext(v5) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
665
+ ; VI-NEXT: v_max_i32_sdwa v7, sext(v4), sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
666
+ ; VI-NEXT: v_min_i32_sdwa v8, sext(v4), sext(v5) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
667
+ ; VI-NEXT: v_min_i32_sdwa v4, sext(v4), sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
668
+ ; VI-NEXT: v_or_b32_sdwa v5, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
669
+ ; VI-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
670
670
; VI-NEXT: flat_store_dword v[0:1], v5
671
671
; VI-NEXT: s_waitcnt vmcnt(0)
672
672
; VI-NEXT: flat_store_dword v[2:3], v4
@@ -748,37 +748,37 @@ define amdgpu_kernel void @s_min_max_v4i16(ptr addrspace(1) %out0, ptr addrspace
748
748
; VI-NEXT: s_waitcnt lgkmcnt(0)
749
749
; VI-NEXT: v_mov_b32_e32 v0, s0
750
750
; VI-NEXT: v_mov_b32_e32 v1, s1
751
- ; VI-NEXT: v_mov_b32_e32 v2, s2
752
751
; VI-NEXT: v_mov_b32_e32 v3, s3
753
- ; VI-NEXT: s_ashr_i32 s0, s5, 16
754
- ; VI-NEXT: s_ashr_i32 s1, s4, 16
755
- ; VI-NEXT: s_sext_i32_i16 s2, s5
756
- ; VI-NEXT: s_sext_i32_i16 s3, s4
757
- ; VI-NEXT: s_ashr_i32 s4, s7, 16
758
- ; VI-NEXT: s_ashr_i32 s5, s6, 16
759
- ; VI-NEXT: s_sext_i32_i16 s7, s7
752
+ ; VI-NEXT: s_ashr_i32 s0, s7, 16
753
+ ; VI-NEXT: s_ashr_i32 s1, s5, 16
754
+ ; VI-NEXT: s_sext_i32_i16 s3, s7
755
+ ; VI-NEXT: s_sext_i32_i16 s5, s5
756
+ ; VI-NEXT: v_mov_b32_e32 v2, s2
757
+ ; VI-NEXT: s_max_i32 s2, s1, s0
758
+ ; VI-NEXT: s_max_i32 s7, s5, s3
759
+ ; VI-NEXT: s_lshl_b32 s2, s2, 16
760
+ ; VI-NEXT: s_and_b32 s7, s7, 0xffff
761
+ ; VI-NEXT: s_or_b32 s2, s7, s2
762
+ ; VI-NEXT: s_ashr_i32 s7, s6, 16
763
+ ; VI-NEXT: s_ashr_i32 s8, s4, 16
760
764
; VI-NEXT: s_sext_i32_i16 s6, s6
761
- ; VI-NEXT: s_max_i32 s8, s1, s5
762
- ; VI-NEXT: s_max_i32 s9, s0, s4
763
- ; VI-NEXT: s_max_i32 s10, s3, s6
764
- ; VI-NEXT: s_max_i32 s11, s2, s7
765
- ; VI-NEXT: s_min_i32 s0, s0, s4
766
- ; VI-NEXT: s_min_i32 s2, s2, s7
765
+ ; VI-NEXT: s_sext_i32_i16 s4, s4
766
+ ; VI-NEXT: s_min_i32 s0, s1, s0
767
+ ; VI-NEXT: s_min_i32 s1, s5, s3
768
+ ; VI-NEXT: s_max_i32 s9, s8, s7
769
+ ; VI-NEXT: s_max_i32 s10, s4, s6
770
+ ; VI-NEXT: s_lshl_b32 s0, s0, 16
771
+ ; VI-NEXT: s_and_b32 s1, s1, 0xffff
767
772
; VI-NEXT: s_lshl_b32 s9, s9, 16
768
- ; VI-NEXT: s_and_b32 s11, s11, 0xffff
769
- ; VI-NEXT: s_lshl_b32 s8, s8, 16
770
773
; VI-NEXT: s_and_b32 s10, s10, 0xffff
771
- ; VI-NEXT: s_min_i32 s1, s1, s5
772
- ; VI-NEXT: s_min_i32 s3, s3, s6
773
- ; VI-NEXT: s_lshl_b32 s0, s0, 16
774
- ; VI-NEXT: s_and_b32 s2, s2, 0xffff
775
- ; VI-NEXT: s_or_b32 s9, s11, s9
776
- ; VI-NEXT: s_or_b32 s8, s10, s8
777
- ; VI-NEXT: s_or_b32 s0, s2, s0
774
+ ; VI-NEXT: v_mov_b32_e32 v5, s2
775
+ ; VI-NEXT: s_or_b32 s0, s1, s0
776
+ ; VI-NEXT: s_min_i32 s1, s8, s7
777
+ ; VI-NEXT: s_min_i32 s2, s4, s6
778
+ ; VI-NEXT: s_or_b32 s9, s10, s9
778
779
; VI-NEXT: s_lshl_b32 s1, s1, 16
779
- ; VI-NEXT: s_and_b32 s2, s3, 0xffff
780
- ; VI-NEXT: v_mov_b32_e32 v4, s8
781
- ; VI-NEXT: v_mov_b32_e32 v5, s9
780
+ ; VI-NEXT: s_and_b32 s2, s2, 0xffff
781
+ ; VI-NEXT: v_mov_b32_e32 v4, s9
782
782
; VI-NEXT: s_or_b32 s1, s2, s1
783
783
; VI-NEXT: v_mov_b32_e32 v6, s1
784
784
; VI-NEXT: v_mov_b32_e32 v7, s0
@@ -861,26 +861,26 @@ define amdgpu_kernel void @v_min_max_v2i16_user(ptr addrspace(1) %out0, ptr addr
861
861
; GFX9-NEXT: global_load_dword v2, v0, s[10:11] glc
862
862
; GFX9-NEXT: s_waitcnt vmcnt(0)
863
863
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1
864
+ ; GFX9-NEXT: v_cmp_gt_i32_sdwa vcc, sext(v1), sext(v2) src0_sel:WORD_0 src1_sel:WORD_0
865
+ ; GFX9-NEXT: v_cmp_gt_i32_sdwa s[0:1], sext(v1), sext(v2) src0_sel:WORD_1 src1_sel:WORD_1
864
866
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2
865
- ; GFX9-NEXT: v_cmp_gt_i16_e32 vcc, v1, v2
866
867
; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v1, vcc
867
- ; GFX9-NEXT: v_cmp_gt_i16_e64 s[0:1], v3, v4
868
868
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
869
+ ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
869
870
; GFX9-NEXT: v_cndmask_b32_e64 v6, v4, v3, s[0:1]
870
- ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5
871
871
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1]
872
+ ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v5
873
+ ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
872
874
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
873
- ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
874
- ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
875
- ; GFX9-NEXT: v_lshl_or_b32 v5, v6, 16, v5
875
+ ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v2
876
+ ; GFX9-NEXT: v_lshl_or_b32 v4, v6, 16, v4
876
877
; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1
877
- ; GFX9-NEXT: v_lshlrev_b16_e32 v3, 1, v4
878
- ; GFX9-NEXT: global_store_dword v0, v5, s[4:5]
878
+ ; GFX9-NEXT: v_or_b32_e32 v2, v5, v2
879
+ ; GFX9-NEXT: global_store_dword v0, v4, s[4:5]
879
880
; GFX9-NEXT: s_waitcnt vmcnt(0)
880
881
; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
881
882
; GFX9-NEXT: s_waitcnt vmcnt(0)
882
- ; GFX9-NEXT: v_or_b32_e32 v0, v2, v3
883
- ; GFX9-NEXT: v_and_b32_e32 v0, 3, v0
883
+ ; GFX9-NEXT: v_and_b32_e32 v0, 3, v2
884
884
; GFX9-NEXT: global_store_byte v[0:1], v0, off
885
885
; GFX9-NEXT: s_waitcnt vmcnt(0)
886
886
; GFX9-NEXT: s_endpgm
@@ -899,42 +899,34 @@ define amdgpu_kernel void @v_min_max_v2i16_user(ptr addrspace(1) %out0, ptr addr
899
899
; VI-NEXT: s_waitcnt vmcnt(0)
900
900
; VI-NEXT: v_mov_b32_e32 v0, s0
901
901
; VI-NEXT: v_mov_b32_e32 v1, s1
902
- ; VI-NEXT: v_mov_b32_e32 v3, s3
903
902
; VI-NEXT: v_mov_b32_e32 v2, s2
904
- ; VI-NEXT: v_readfirstlane_b32 s0, v4
905
- ; VI-NEXT: v_readfirstlane_b32 s1, v5
906
- ; VI-NEXT: s_ashr_i32 s3, s0, 16
907
- ; VI-NEXT: s_ashr_i32 s5, s1, 16
908
- ; VI-NEXT: s_cmp_gt_i32 s3, s5
909
- ; VI-NEXT: s_sext_i32_i16 s2, s0
910
- ; VI-NEXT: s_sext_i32_i16 s4, s1
911
- ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
912
- ; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
913
- ; VI-NEXT: s_and_b64 s[0:1], s[0:1], exec
914
- ; VI-NEXT: s_cselect_b32 s0, s3, s5
915
- ; VI-NEXT: s_cselect_b32 s3, s5, s3
916
- ; VI-NEXT: s_lshl_b32 s5, s0, 16
917
- ; VI-NEXT: s_cmp_gt_i32 s2, s4
918
- ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
919
- ; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1]
920
- ; VI-NEXT: s_and_b64 s[0:1], s[0:1], exec
921
- ; VI-NEXT: s_cselect_b32 s0, s2, s4
922
- ; VI-NEXT: s_cselect_b32 s1, s4, s2
923
- ; VI-NEXT: s_and_b32 s0, s0, 0xffff
924
- ; VI-NEXT: v_lshlrev_b16_e32 v4, 1, v4
925
- ; VI-NEXT: s_lshl_b32 s2, s3, 16
926
- ; VI-NEXT: s_and_b32 s1, s1, 0xffff
927
- ; VI-NEXT: s_or_b32 s0, s0, s5
928
- ; VI-NEXT: v_or_b32_e32 v4, v5, v4
929
- ; VI-NEXT: s_or_b32 s1, s1, s2
930
- ; VI-NEXT: v_mov_b32_e32 v5, s0
931
- ; VI-NEXT: v_and_b32_e32 v4, 3, v4
932
- ; VI-NEXT: v_mov_b32_e32 v6, s1
933
- ; VI-NEXT: flat_store_dword v[0:1], v5
903
+ ; VI-NEXT: v_mov_b32_e32 v3, s3
904
+ ; VI-NEXT: v_ashrrev_i32_e32 v10, 16, v4
905
+ ; VI-NEXT: v_ashrrev_i32_e32 v11, 16, v5
906
+ ; VI-NEXT: v_bfe_i32 v6, v4, 0, 16
907
+ ; VI-NEXT: v_bfe_i32 v7, v5, 0, 16
908
+ ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v4
909
+ ; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v5
910
+ ; VI-NEXT: v_cmp_gt_i32_e32 vcc, v10, v11
911
+ ; VI-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc
912
+ ; VI-NEXT: v_cmp_gt_i32_e64 s[0:1], v6, v7
913
+ ; VI-NEXT: v_cndmask_b32_e64 v6, v5, v4, s[0:1]
914
+ ; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc
915
+ ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v10
916
+ ; VI-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1]
917
+ ; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
918
+ ; VI-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1]
919
+ ; VI-NEXT: v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
920
+ ; VI-NEXT: v_lshlrev_b32_e32 v5, 1, v5
921
+ ; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
922
+ ; VI-NEXT: flat_store_dword v[0:1], v6
934
923
; VI-NEXT: s_waitcnt vmcnt(0)
935
- ; VI-NEXT: flat_store_dword v[2:3], v6
924
+ ; VI-NEXT: v_or_b32_e32 v0, v9, v5
925
+ ; VI-NEXT: v_or_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
926
+ ; VI-NEXT: v_and_b32_e32 v0, 3, v0
927
+ ; VI-NEXT: flat_store_dword v[2:3], v4
936
928
; VI-NEXT: s_waitcnt vmcnt(0)
937
- ; VI-NEXT: flat_store_byte v[0:1], v4
929
+ ; VI-NEXT: flat_store_byte v[0:1], v0
938
930
; VI-NEXT: s_waitcnt vmcnt(0)
939
931
; VI-NEXT: s_endpgm
940
932
;
@@ -1021,19 +1013,19 @@ define amdgpu_kernel void @u_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace
1021
1013
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
1022
1014
; VI-NEXT: s_waitcnt lgkmcnt(0)
1023
1015
; VI-NEXT: v_mov_b32_e32 v1, s5
1024
- ; VI-NEXT: s_lshr_b32 s2, s0, 16
1025
- ; VI-NEXT: s_lshr_b32 s3, s1, 16
1026
- ; VI-NEXT: s_and_b32 s0, s0, 0xffff
1027
- ; VI-NEXT: s_and_b32 s1, s1, 0xffff
1028
- ; VI-NEXT: s_max_u32 s5, s2, s3
1016
+ ; VI-NEXT: s_and_b32 s2, s1, 0xffff
1017
+ ; VI-NEXT: s_and_b32 s3, s0, 0xffff
1018
+ ; VI-NEXT: s_lshr_b32 s1, s1, 16
1019
+ ; VI-NEXT: s_lshr_b32 s0, s0, 16
1020
+ ; VI-NEXT: s_max_u32 s5, s0, s1
1029
1021
; VI-NEXT: v_mov_b32_e32 v0, s4
1030
- ; VI-NEXT: s_max_u32 s4, s0, s1
1022
+ ; VI-NEXT: s_max_u32 s4, s3, s2
1031
1023
; VI-NEXT: s_lshl_b32 s5, s5, 16
1032
1024
; VI-NEXT: s_min_u32 s0, s0, s1
1033
- ; VI-NEXT: s_min_u32 s1, s2, s3
1034
1025
; VI-NEXT: s_or_b32 s4, s4, s5
1035
- ; VI-NEXT: s_lshl_b32 s1, s1, 16
1036
- ; VI-NEXT: s_or_b32 s0, s0, s1
1026
+ ; VI-NEXT: s_min_u32 s2, s3, s2
1027
+ ; VI-NEXT: s_lshl_b32 s0, s0, 16
1028
+ ; VI-NEXT: s_or_b32 s0, s2, s0
1037
1029
; VI-NEXT: v_mov_b32_e32 v4, s4
1038
1030
; VI-NEXT: v_mov_b32_e32 v2, s6
1039
1031
; VI-NEXT: v_mov_b32_e32 v3, s7
0 commit comments