@@ -916,3 +916,88 @@ exit:
916
916
%r2 = select <4 x i1 > %b2 , <4 x half > <half 0xH3900, half 0xH3900, half 0xH3900, half 0xH3900>, <4 x half > <half 0xH3D00, half 0xH3D00, half 0xH3D00, half 0xH3D00>
917
917
ret <4 x half > %r2
918
918
}
919
+
920
; large_vector - assembles an <8 x i16> result from eight separate i16 loads
; made through the addrspace(3) pointer %p, starting at half-element index
; (%idxp << 4). The elements are loaded in pairs (even element "align 4", odd
; element "align 2"); each pair is placed in lanes 0-1 of its own
; poison-initialised vector, and the four pairs are then concatenated with
; three shufflevector instructions.
; The expected-output comments that follow the signature show the SI-prefixed
; checks reading four dwords with ds_read_b32 and the GFX9-prefixed checks
; reading them with two ds_read2_b32 instructions.
; NOTE(review) - the expected-output lines look auto-generated; confirm against
; the file header before editing them by hand.
+ define <8 x i16 > @large_vector (ptr addrspace (3 ) %p , i32 %idxp ) {
921
+ ; SI-LABEL: large_vector:
922
+ ; SI: ; %bb.0:
923
+ ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
924
+ ; SI-NEXT: v_lshlrev_b32_e32 v1, 5, v1
925
+ ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1
926
+ ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0
927
+ ; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0
928
+ ; SI-NEXT: v_add_i32_e32 v5, vcc, 12, v0
929
+ ; SI-NEXT: s_mov_b32 m0, -1
930
+ ; SI-NEXT: ds_read_b32 v0, v0
931
+ ; SI-NEXT: ds_read_b32 v2, v1
932
+ ; SI-NEXT: ds_read_b32 v4, v3
933
+ ; SI-NEXT: ds_read_b32 v6, v5
934
+ ; SI-NEXT: s_waitcnt lgkmcnt(3)
935
+ ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
936
+ ; SI-NEXT: s_waitcnt lgkmcnt(2)
937
+ ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
938
+ ; SI-NEXT: s_waitcnt lgkmcnt(1)
939
+ ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
940
+ ; SI-NEXT: s_waitcnt lgkmcnt(0)
941
+ ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6
942
+ ; SI-NEXT: s_setpc_b64 s[30:31]
943
+ ;
944
+ ; GFX9-LABEL: large_vector:
945
+ ; GFX9: ; %bb.0:
946
+ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
947
+ ; GFX9-NEXT: v_lshl_add_u32 v2, v1, 5, v0
948
+ ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
949
+ ; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
950
+ ; GFX9-NEXT: s_mov_b32 s4, 0xffff
951
+ ; GFX9-NEXT: s_waitcnt lgkmcnt(1)
952
+ ; GFX9-NEXT: v_bfi_b32 v4, s4, v0, v0
953
+ ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
954
+ ; GFX9-NEXT: v_bfi_b32 v5, s4, v2, v2
955
+ ; GFX9-NEXT: v_bfi_b32 v4, s4, v0, v4
956
+ ; GFX9-NEXT: v_bfi_b32 v4, s4, v0, v4
957
+ ; GFX9-NEXT: v_bfi_b32 v5, s4, v2, v5
958
+ ; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v5
959
+ ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v4
960
+ ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Scale the incoming index by 16. The low four bits of %idx are therefore
; zero, so each "or" with a constant 0..7 below only sets bits known to be
; clear and yields %idx plus that constant.
961
+ %idx = shl i32 %idxp , 4
962
+
; Pair 0 - elements %idx|0 and %idx|1, placed in lanes 0 and 1 of %v0.
963
+ %i.0 = or i32 %idx , 0
964
+ %p.0 = getelementptr half , ptr addrspace (3 ) %p , i32 %i.0
965
+ %x.0 = load i16 , ptr addrspace (3 ) %p.0 , align 4
966
+ %v0p = insertelement <8 x i16 > poison, i16 %x.0 , i32 0
967
+ %i.1 = or i32 %idx , 1
968
+ %p.1 = getelementptr half , ptr addrspace (3 ) %p , i32 %i.1
969
+ %x.1 = load i16 , ptr addrspace (3 ) %p.1 , align 2
970
+ %v0 = insertelement <8 x i16 > %v0p , i16 %x.1 , i32 1
971
+
; Pair 1 - elements %idx|2 and %idx|3, placed in lanes 0 and 1 of %v1.
972
+ %i.2 = or i32 %idx , 2
973
+ %p.2 = getelementptr half , ptr addrspace (3 ) %p , i32 %i.2
974
+ %x.2 = load i16 , ptr addrspace (3 ) %p.2 , align 4
975
+ %v1p = insertelement <8 x i16 > poison, i16 %x.2 , i32 0
976
+ %i.3 = or i32 %idx , 3
977
+ %p.3 = getelementptr half , ptr addrspace (3 ) %p , i32 %i.3
978
+ %x.3 = load i16 , ptr addrspace (3 ) %p.3 , align 2
979
+ %v1 = insertelement <8 x i16 > %v1p , i16 %x.3 , i32 1
980
+
; Pair 2 - elements %idx|4 and %idx|5, placed in lanes 0 and 1 of %v2.
981
+ %i.4 = or i32 %idx , 4
982
+ %p.4 = getelementptr half , ptr addrspace (3 ) %p , i32 %i.4
983
+ %x.4 = load i16 , ptr addrspace (3 ) %p.4 , align 4
984
+ %v2p = insertelement <8 x i16 > poison, i16 %x.4 , i32 0
985
+ %i.5 = or i32 %idx , 5
986
+ %p.5 = getelementptr half , ptr addrspace (3 ) %p , i32 %i.5
987
+ %x.5 = load i16 , ptr addrspace (3 ) %p.5 , align 2
988
+ %v2 = insertelement <8 x i16 > %v2p , i16 %x.5 , i32 1
989
+
; Pair 3 - elements %idx|6 and %idx|7, placed in lanes 0 and 1 of %v3.
990
+ %i.6 = or i32 %idx , 6
991
+ %p.6 = getelementptr half , ptr addrspace (3 ) %p , i32 %i.6
992
+ %x.6 = load i16 , ptr addrspace (3 ) %p.6 , align 4
993
+ %v3p = insertelement <8 x i16 > poison, i16 %x.6 , i32 0
994
+ %i.7 = or i32 %idx , 7
995
+ %p.7 = getelementptr half , ptr addrspace (3 ) %p , i32 %i.7
996
+ %x.7 = load i16 , ptr addrspace (3 ) %p.7 , align 2
997
+ %v3 = insertelement <8 x i16 > %v3p , i16 %x.7 , i32 1
998
+
; Concatenate the four pairs. In every mask, indices 0-7 pick lanes of the
; first operand and indices 8 and 9 pick lanes 0 and 1 of the second operand,
; so each shuffle appends one more pair after the lanes already filled and
; %z.3 ends up as <x.0, x.1, x.2, x.3, x.4, x.5, x.6, x.7>.
999
+ %z.1 = shufflevector <8 x i16 > %v0 , <8 x i16 > %v1 , <8 x i32 > <i32 0 , i32 1 , i32 8 , i32 9 , i32 undef , i32 undef , i32 undef , i32 undef >
1000
+ %z.2 = shufflevector <8 x i16 > %z.1 , <8 x i16 > %v2 , <8 x i32 > <i32 0 , i32 1 , i32 2 , i32 3 , i32 8 , i32 9 , i32 undef , i32 undef >
1001
+ %z.3 = shufflevector <8 x i16 > %z.2 , <8 x i16 > %v3 , <8 x i32 > <i32 0 , i32 1 , i32 2 , i32 3 , i32 4 , i32 5 , i32 8 , i32 9 >
1002
+ ret <8 x i16 > %z.3
1003
+ }
0 commit comments