@@ -649,36 +649,35 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
649
649
;
650
650
; GFX9-LABEL: s_test_imin_sle_v4i8:
651
651
; GFX9: ; %bb.0:
652
- ; GFX9-NEXT: s_load_dword s3, s[8:9], 0x4c
653
652
; GFX9-NEXT: s_load_dword s2, s[8:9], 0x28
653
+ ; GFX9-NEXT: s_load_dword s3, s[8:9], 0x4c
654
654
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
655
655
; GFX9-NEXT: v_mov_b32_e32 v0, 0
656
656
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
657
- ; GFX9-NEXT: s_lshr_b32 s5, s2, 16
658
- ; GFX9-NEXT: s_lshr_b32 s8, s3, 16
659
- ; GFX9-NEXT: s_ashr_i32 s9, s3, 24
660
- ; GFX9-NEXT: s_ashr_i32 s6, s2, 24
661
- ; GFX9-NEXT: s_bfe_i32 s8, s8, 0x80000
662
- ; GFX9-NEXT: v_mov_b32_e32 v1, s9
663
- ; GFX9-NEXT: s_bfe_i32 s5, s5, 0x80000
657
+ ; GFX9-NEXT: s_sext_i32_i16 s5, s2
664
658
; GFX9-NEXT: s_sext_i32_i16 s7, s3
665
- ; GFX9-NEXT: v_min_i16_e32 v1, s6, v1
666
- ; GFX9-NEXT: v_mov_b32_e32 v2, s8
667
- ; GFX9-NEXT: s_sext_i32_i16 s4, s2
668
- ; GFX9-NEXT: s_lshr_b32 s7, s7, 8
669
- ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
670
- ; GFX9-NEXT: v_min_i16_e32 v2, s5, v2
671
- ; GFX9-NEXT: s_lshr_b32 s4, s4, 8
672
- ; GFX9-NEXT: s_bfe_i32 s3, s3, 0x80000
673
- ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
674
- ; GFX9-NEXT: v_mov_b32_e32 v2, s7
675
- ; GFX9-NEXT: s_bfe_i32 s2, s2, 0x80000
676
- ; GFX9-NEXT: v_min_i16_e32 v2, s4, v2
677
- ; GFX9-NEXT: v_mov_b32_e32 v3, s3
678
- ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2
679
- ; GFX9-NEXT: v_min_i16_e32 v3, s2, v3
680
- ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
681
- ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
659
+ ; GFX9-NEXT: s_ashr_i32 s7, s7, 8
660
+ ; GFX9-NEXT: s_ashr_i32 s5, s5, 8
661
+ ; GFX9-NEXT: s_ashr_i32 s4, s2, 24
662
+ ; GFX9-NEXT: s_ashr_i32 s6, s3, 24
663
+ ; GFX9-NEXT: s_min_i32 s5, s5, s7
664
+ ; GFX9-NEXT: s_sext_i32_i8 s7, s3
665
+ ; GFX9-NEXT: s_sext_i32_i8 s8, s2
666
+ ; GFX9-NEXT: s_bfe_i32 s3, s3, 0x80010
667
+ ; GFX9-NEXT: s_bfe_i32 s2, s2, 0x80010
668
+ ; GFX9-NEXT: s_min_i32 s7, s8, s7
669
+ ; GFX9-NEXT: s_min_i32 s4, s4, s6
670
+ ; GFX9-NEXT: s_min_i32 s2, s2, s3
671
+ ; GFX9-NEXT: s_lshl_b32 s5, s5, 8
672
+ ; GFX9-NEXT: s_and_b32 s7, s7, 0xff
673
+ ; GFX9-NEXT: s_lshl_b32 s4, s4, 8
674
+ ; GFX9-NEXT: s_and_b32 s2, s2, 0xff
675
+ ; GFX9-NEXT: s_or_b32 s5, s7, s5
676
+ ; GFX9-NEXT: s_or_b32 s2, s2, s4
677
+ ; GFX9-NEXT: s_and_b32 s5, s5, 0xffff
678
+ ; GFX9-NEXT: s_lshl_b32 s2, s2, 16
679
+ ; GFX9-NEXT: s_or_b32 s2, s5, s2
680
+ ; GFX9-NEXT: v_mov_b32_e32 v1, s2
682
681
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
683
682
; GFX9-NEXT: s_endpgm
684
683
;
@@ -688,111 +687,70 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
688
687
; GFX10-NEXT: s_load_dword s2, s[8:9], 0x28
689
688
; GFX10-NEXT: s_load_dword s3, s[8:9], 0x4c
690
689
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
690
+ ; GFX10-NEXT: v_mov_b32_e32 v0, 0
691
691
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
692
- ; GFX10-NEXT: s_sext_i32_i16 s4 , s2
692
+ ; GFX10-NEXT: s_sext_i32_i16 s5 , s2
693
693
; GFX10-NEXT: s_sext_i32_i16 s7, s3
694
- ; GFX10-NEXT: s_ashr_i32 s6, s2, 24
695
- ; GFX10-NEXT: s_ashr_i32 s9, s3, 24
696
- ; GFX10-NEXT: s_lshr_b32 s4, s4, 8
697
- ; GFX10-NEXT: s_lshr_b32 s7, s7, 8
698
- ; GFX10-NEXT: v_min_i16 v0, s6, s9
699
- ; GFX10-NEXT: v_min_i16 v1, s4, s7
700
- ; GFX10-NEXT: s_lshr_b32 s5, s2, 16
701
- ; GFX10-NEXT: s_lshr_b32 s8, s3, 16
702
- ; GFX10-NEXT: s_bfe_i32 s2, s2, 0x80000
703
- ; GFX10-NEXT: s_bfe_i32 s5, s5, 0x80000
704
- ; GFX10-NEXT: s_bfe_i32 s4, s8, 0x80000
705
- ; GFX10-NEXT: s_bfe_i32 s3, s3, 0x80000
706
- ; GFX10-NEXT: v_min_i16 v2, s5, s4
707
- ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0
708
- ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1
709
- ; GFX10-NEXT: v_min_i16 v3, s2, s3
710
- ; GFX10-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
711
- ; GFX10-NEXT: v_mov_b32_e32 v2, 0
712
- ; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
713
- ; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
714
- ; GFX10-NEXT: global_store_dword v2, v0, s[0:1]
694
+ ; GFX10-NEXT: s_ashr_i32 s4, s2, 24
695
+ ; GFX10-NEXT: s_ashr_i32 s6, s3, 24
696
+ ; GFX10-NEXT: s_sext_i32_i8 s8, s3
697
+ ; GFX10-NEXT: s_sext_i32_i8 s9, s2
698
+ ; GFX10-NEXT: s_bfe_i32 s3, s3, 0x80010
699
+ ; GFX10-NEXT: s_bfe_i32 s2, s2, 0x80010
700
+ ; GFX10-NEXT: s_ashr_i32 s7, s7, 8
701
+ ; GFX10-NEXT: s_ashr_i32 s5, s5, 8
702
+ ; GFX10-NEXT: s_min_i32 s8, s9, s8
703
+ ; GFX10-NEXT: s_min_i32 s4, s4, s6
704
+ ; GFX10-NEXT: s_min_i32 s2, s2, s3
705
+ ; GFX10-NEXT: s_min_i32 s3, s5, s7
706
+ ; GFX10-NEXT: s_and_b32 s5, s8, 0xff
707
+ ; GFX10-NEXT: s_lshl_b32 s4, s4, 8
708
+ ; GFX10-NEXT: s_lshl_b32 s3, s3, 8
709
+ ; GFX10-NEXT: s_and_b32 s2, s2, 0xff
710
+ ; GFX10-NEXT: s_or_b32 s3, s5, s3
711
+ ; GFX10-NEXT: s_or_b32 s2, s2, s4
712
+ ; GFX10-NEXT: s_and_b32 s3, s3, 0xffff
713
+ ; GFX10-NEXT: s_lshl_b32 s2, s2, 16
714
+ ; GFX10-NEXT: s_or_b32 s2, s3, s2
715
+ ; GFX10-NEXT: v_mov_b32_e32 v1, s2
716
+ ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
715
717
; GFX10-NEXT: s_endpgm
716
718
;
717
- ; GFX11-TRUE16-LABEL: s_test_imin_sle_v4i8:
718
- ; GFX11-TRUE16: ; %bb.0:
719
- ; GFX11-TRUE16-NEXT: s_clause 0x1
720
- ; GFX11-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x28
721
- ; GFX11-TRUE16-NEXT: s_load_b32 s1, s[4:5], 0x4c
722
- ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
723
- ; GFX11-TRUE16-NEXT: s_sext_i32_i16 s2, s0
724
- ; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s0, 16
725
- ; GFX11-TRUE16-NEXT: s_sext_i32_i16 s7, s1
726
- ; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s1, 16
727
- ; GFX11-TRUE16-NEXT: s_ashr_i32 s6, s0, 24
728
- ; GFX11-TRUE16-NEXT: s_ashr_i32 s9, s1, 24
729
- ; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 8
730
- ; GFX11-TRUE16-NEXT: s_bfe_i32 s3, s3, 0x80000
731
- ; GFX11-TRUE16-NEXT: s_bfe_i32 s0, s0, 0x80000
732
- ; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s7, 8
733
- ; GFX11-TRUE16-NEXT: s_bfe_i32 s8, s8, 0x80000
734
- ; GFX11-TRUE16-NEXT: s_bfe_i32 s1, s1, 0x80000
735
- ; GFX11-TRUE16-NEXT: v_min_i16 v0.l, s6, s9
736
- ; GFX11-TRUE16-NEXT: v_min_i16 v1.l, s3, s8
737
- ; GFX11-TRUE16-NEXT: v_min_i16 v2.l, s2, s7
738
- ; GFX11-TRUE16-NEXT: v_min_i16 v3.l, s0, s1
739
- ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
740
- ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 8, v0
741
- ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
742
- ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
743
- ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
744
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
745
- ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
746
- ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2
747
- ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
748
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
749
- ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
750
- ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
751
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
752
- ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
753
- ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
754
- ; GFX11-TRUE16-NEXT: global_store_b32 v2, v0, s[0:1]
755
- ; GFX11-TRUE16-NEXT: s_endpgm
756
- ;
757
- ; GFX11-FAKE16-LABEL: s_test_imin_sle_v4i8:
758
- ; GFX11-FAKE16: ; %bb.0:
759
- ; GFX11-FAKE16-NEXT: s_clause 0x1
760
- ; GFX11-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x28
761
- ; GFX11-FAKE16-NEXT: s_load_b32 s1, s[4:5], 0x4c
762
- ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
763
- ; GFX11-FAKE16-NEXT: s_sext_i32_i16 s2, s0
764
- ; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 16
765
- ; GFX11-FAKE16-NEXT: s_sext_i32_i16 s7, s1
766
- ; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s1, 16
767
- ; GFX11-FAKE16-NEXT: s_ashr_i32 s6, s0, 24
768
- ; GFX11-FAKE16-NEXT: s_bfe_i32 s0, s0, 0x80000
769
- ; GFX11-FAKE16-NEXT: s_ashr_i32 s9, s1, 24
770
- ; GFX11-FAKE16-NEXT: s_bfe_i32 s1, s1, 0x80000
771
- ; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 8
772
- ; GFX11-FAKE16-NEXT: s_bfe_i32 s3, s3, 0x80000
773
- ; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s7, 8
774
- ; GFX11-FAKE16-NEXT: s_bfe_i32 s8, s8, 0x80000
775
- ; GFX11-FAKE16-NEXT: v_min_i16 v0, s6, s9
776
- ; GFX11-FAKE16-NEXT: v_min_i16 v1, s0, s1
777
- ; GFX11-FAKE16-NEXT: v_min_i16 v2, s3, s8
778
- ; GFX11-FAKE16-NEXT: v_min_i16 v3, s2, s7
779
- ; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
780
- ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 8, v0
781
- ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
782
- ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
783
- ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3
784
- ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
785
- ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v2, v0
786
- ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v3
787
- ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0
788
- ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
789
- ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
790
- ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
791
- ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
792
- ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v1, v0
793
- ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
794
- ; GFX11-FAKE16-NEXT: global_store_b32 v2, v0, s[0:1]
795
- ; GFX11-FAKE16-NEXT: s_endpgm
719
+ ; GFX11-LABEL: s_test_imin_sle_v4i8:
720
+ ; GFX11: ; %bb.0:
721
+ ; GFX11-NEXT: s_clause 0x2
722
+ ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x28
723
+ ; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x4c
724
+ ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
725
+ ; GFX11-NEXT: v_mov_b32_e32 v0, 0
726
+ ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
727
+ ; GFX11-NEXT: s_sext_i32_i16 s5, s2
728
+ ; GFX11-NEXT: s_sext_i32_i16 s7, s3
729
+ ; GFX11-NEXT: s_ashr_i32 s4, s2, 24
730
+ ; GFX11-NEXT: s_ashr_i32 s6, s3, 24
731
+ ; GFX11-NEXT: s_sext_i32_i8 s8, s3
732
+ ; GFX11-NEXT: s_sext_i32_i8 s9, s2
733
+ ; GFX11-NEXT: s_bfe_i32 s3, s3, 0x80010
734
+ ; GFX11-NEXT: s_bfe_i32 s2, s2, 0x80010
735
+ ; GFX11-NEXT: s_ashr_i32 s7, s7, 8
736
+ ; GFX11-NEXT: s_ashr_i32 s5, s5, 8
737
+ ; GFX11-NEXT: s_min_i32 s8, s9, s8
738
+ ; GFX11-NEXT: s_min_i32 s4, s4, s6
739
+ ; GFX11-NEXT: s_min_i32 s2, s2, s3
740
+ ; GFX11-NEXT: s_min_i32 s3, s5, s7
741
+ ; GFX11-NEXT: s_and_b32 s5, s8, 0xff
742
+ ; GFX11-NEXT: s_lshl_b32 s4, s4, 8
743
+ ; GFX11-NEXT: s_lshl_b32 s3, s3, 8
744
+ ; GFX11-NEXT: s_and_b32 s2, s2, 0xff
745
+ ; GFX11-NEXT: s_or_b32 s3, s5, s3
746
+ ; GFX11-NEXT: s_or_b32 s2, s2, s4
747
+ ; GFX11-NEXT: s_and_b32 s3, s3, 0xffff
748
+ ; GFX11-NEXT: s_lshl_b32 s2, s2, 16
749
+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
750
+ ; GFX11-NEXT: s_or_b32 s2, s3, s2
751
+ ; GFX11-NEXT: v_mov_b32_e32 v1, s2
752
+ ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
753
+ ; GFX11-NEXT: s_endpgm
796
754
%cmp = icmp sle <4 x i8 > %a , %b
797
755
%val = select <4 x i1 > %cmp , <4 x i8 > %a , <4 x i8 > %b
798
756
store <4 x i8 > %val , ptr addrspace (1 ) %out
0 commit comments