@@ -762,10 +762,9 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
- ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
- ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29,2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31]
- ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
- ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax)
+ ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,32,36,40,44,1,5,9,13,33,37,41,45,2,6,10,14,34,38,42,46,3,7,11,15,35,39,43,47]
+ ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
+ ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -788,10 +787,9 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
- ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
- ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29,2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31]
- ; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0
- ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
+ ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,32,36,40,44,1,5,9,13,33,37,41,45,2,6,10,14,34,38,42,46,3,7,11,15,35,39,43,47]
+ ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
+ ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -814,10 +812,9 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
- ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
- ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29,2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31]
- ; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
- ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax)
+ ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,32,36,40,44,1,5,9,13,33,37,41,45,2,6,10,14,34,38,42,46,3,7,11,15,35,39,43,47]
+ ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
+ ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
@@ -840,10 +837,9 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
- ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
- ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29,2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31]
- ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0
- ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
+ ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,32,36,40,44,1,5,9,13,33,37,41,45,2,6,10,14,34,38,42,46,3,7,11,15,35,39,43,47]
+ ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
+ ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%in.vec0 = load <4 x i16>, ptr %in.vecptr0, align 64
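The updated checks fold the former vinserti64x4 + vpermw pair into a single vpermi2w, a two-source word permute: mask indices 0-31 select words from the first source (%zmm0) and indices 32-63 select from the second (%zmm2), which is why the new vpmovsxbw constant carries 32,36,40,44,... where the old one had 16,20,24,28,.... A minimal C intrinsics sketch of that same two-source permute follows; it is not part of the patch, and the function and variable names are illustrative only (compile with AVX-512BW support, e.g. -mavx512bw).

#include <immintrin.h>
#include <stdint.h>

/* Illustrative only: interleave two groups of 32 i16 elements with one
   two-source permute, mirroring the vpermi2w index pattern in the diff. */
void interleave_two_zmm(const int16_t *lo, const int16_t *hi, int16_t *out) {
    __m512i a = _mm512_loadu_si512((const void *)lo);  /* plays the role of %zmm0 */
    __m512i b = _mm512_loadu_si512((const void *)hi);  /* plays the role of %zmm2 */
    /* Same index constant as the new vpmovsxbw mask; element 0 first.
       Values < 32 pick from a, values >= 32 pick element (idx - 32) from b. */
    static const int16_t idx[32] = {
        0, 4, 8, 12, 32, 36, 40, 44,
        1, 5, 9, 13, 33, 37, 41, 45,
        2, 6, 10, 14, 34, 38, 42, 46,
        3, 7, 11, 15, 35, 39, 43, 47};
    __m512i perm = _mm512_loadu_si512((const void *)idx);
    /* Single vpermi2w replaces the old vinserti64x4 + vpermw sequence. */
    __m512i r = _mm512_permutex2var_epi16(a, perm, b);
    _mm512_storeu_si512((void *)out, r);
}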