@@ -658,26 +658,47 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out,
658
658
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
659
659
; GFX10-NEXT: s_endpgm
660
660
;
661
- ; GFX11-LABEL: v_test_add_v2i16_zext_to_v2i32:
662
- ; GFX11: ; %bb.0:
663
- ; GFX11-NEXT: s_clause 0x1
664
- ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
665
- ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
666
- ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
667
- ; GFX11-NEXT: v_mov_b32_e32 v2, 0
668
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
669
- ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
670
- ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
671
- ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
672
- ; GFX11-NEXT: s_waitcnt vmcnt(0)
673
- ; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] glc dlc
674
- ; GFX11-NEXT: s_waitcnt vmcnt(0)
675
- ; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
676
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
677
- ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
678
- ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
679
- ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
680
- ; GFX11-NEXT: s_endpgm
661
+ ; GFX11-TRUE16-LABEL: v_test_add_v2i16_zext_to_v2i32:
662
+ ; GFX11-TRUE16: ; %bb.0:
663
+ ; GFX11-TRUE16-NEXT: s_clause 0x1
664
+ ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
665
+ ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
666
+ ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0
667
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
668
+ ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
669
+ ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
670
+ ; GFX11-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
671
+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
672
+ ; GFX11-TRUE16-NEXT: global_load_b32 v0, v0, s[4:5] glc dlc
673
+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
674
+ ; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v1, v0
675
+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
676
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
677
+ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2
678
+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
679
+ ; GFX11-TRUE16-NEXT: global_store_b64 v3, v[0:1], s[0:1]
680
+ ; GFX11-TRUE16-NEXT: s_endpgm
681
+ ;
682
+ ; GFX11-FAKE16-LABEL: v_test_add_v2i16_zext_to_v2i32:
683
+ ; GFX11-FAKE16: ; %bb.0:
684
+ ; GFX11-FAKE16-NEXT: s_clause 0x1
685
+ ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
686
+ ; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
687
+ ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
688
+ ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0
689
+ ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
690
+ ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
691
+ ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
692
+ ; GFX11-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
693
+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
694
+ ; GFX11-FAKE16-NEXT: global_load_b32 v0, v0, s[4:5] glc dlc
695
+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
696
+ ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v1, v0
697
+ ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
698
+ ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
699
+ ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
700
+ ; GFX11-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
701
+ ; GFX11-FAKE16-NEXT: s_endpgm
681
702
%tid = call i32 @llvm.amdgcn.workitem.id.x ()
682
703
%gep.out = getelementptr inbounds <2 x i32 >, ptr addrspace (1 ) %out , i32 %tid
683
704
%gep.in0 = getelementptr inbounds <2 x i16 >, ptr addrspace (1 ) %in0 , i32 %tid
@@ -971,30 +992,57 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out,
971
992
; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
972
993
; GFX10-NEXT: s_endpgm
973
994
;
974
- ; GFX11-LABEL: v_test_add_v2i16_sext_to_v2i64:
975
- ; GFX11: ; %bb.0:
976
- ; GFX11-NEXT: s_clause 0x1
977
- ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
978
- ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
979
- ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
980
- ; GFX11-NEXT: v_mov_b32_e32 v4, 0
981
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
982
- ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
983
- ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
984
- ; GFX11-NEXT: s_clause 0x1
985
- ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
986
- ; GFX11-NEXT: global_load_b32 v0, v0, s[4:5]
987
- ; GFX11-NEXT: s_waitcnt vmcnt(0)
988
- ; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
989
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
990
- ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
991
- ; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16
992
- ; GFX11-NEXT: v_bfe_i32 v2, v1, 0, 16
993
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
994
- ; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0
995
- ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
996
- ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
997
- ; GFX11-NEXT: s_endpgm
995
+ ; GFX11-TRUE16-LABEL: v_test_add_v2i16_sext_to_v2i64:
996
+ ; GFX11-TRUE16: ; %bb.0:
997
+ ; GFX11-TRUE16-NEXT: s_clause 0x1
998
+ ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
999
+ ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1000
+ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1001
+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, 0
1002
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
1003
+ ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1004
+ ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
1005
+ ; GFX11-TRUE16-NEXT: s_clause 0x1
1006
+ ; GFX11-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3]
1007
+ ; GFX11-TRUE16-NEXT: global_load_b32 v0, v0, s[4:5]
1008
+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
1009
+ ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v1, v0
1010
+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
1011
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
1012
+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
1013
+ ; GFX11-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16
1014
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1015
+ ; GFX11-TRUE16-NEXT: v_bfe_i32 v2, v1, 0, 16
1016
+ ; GFX11-TRUE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1017
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
1018
+ ; GFX11-TRUE16-NEXT: v_ashrrev_i32_e32 v3, 31, v2
1019
+ ; GFX11-TRUE16-NEXT: global_store_b128 v4, v[0:3], s[0:1]
1020
+ ; GFX11-TRUE16-NEXT: s_endpgm
1021
+ ;
1022
+ ; GFX11-FAKE16-LABEL: v_test_add_v2i16_sext_to_v2i64:
1023
+ ; GFX11-FAKE16: ; %bb.0:
1024
+ ; GFX11-FAKE16-NEXT: s_clause 0x1
1025
+ ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1026
+ ; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1027
+ ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1028
+ ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, 0
1029
+ ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
1030
+ ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1031
+ ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
1032
+ ; GFX11-FAKE16-NEXT: s_clause 0x1
1033
+ ; GFX11-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3]
1034
+ ; GFX11-FAKE16-NEXT: global_load_b32 v0, v0, s[4:5]
1035
+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
1036
+ ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v1, v0
1037
+ ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1038
+ ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1039
+ ; GFX11-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16
1040
+ ; GFX11-FAKE16-NEXT: v_bfe_i32 v2, v1, 0, 16
1041
+ ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1042
+ ; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1043
+ ; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v3, 31, v2
1044
+ ; GFX11-FAKE16-NEXT: global_store_b128 v4, v[0:3], s[0:1]
1045
+ ; GFX11-FAKE16-NEXT: s_endpgm
998
1046
%tid = call i32 @llvm.amdgcn.workitem.id.x ()
999
1047
%gep.out = getelementptr inbounds <2 x i64 >, ptr addrspace (1 ) %out , i32 %tid
1000
1048
%gep.in0 = getelementptr inbounds <2 x i16 >, ptr addrspace (1 ) %in0 , i32 %tid
0 commit comments