@@ -641,25 +641,16 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
641
641
; AVX512DQ-NEXT: vzeroupper
642
642
; AVX512DQ-NEXT: retq
643
643
;
644
- ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
645
- ; AVX512BW-SLOW: # %bb.0:
646
- ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15]
647
- ; AVX512BW-SLOW-NEXT: vpermw (%rdi), %ymm0, %ymm0
648
- ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
649
- ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx)
650
- ; AVX512BW-SLOW-NEXT: vzeroupper
651
- ; AVX512BW-SLOW-NEXT: retq
652
- ;
653
- ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
654
- ; AVX512BW-FAST: # %bb.0:
655
- ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,6,7]
656
- ; AVX512BW-FAST-NEXT: vpermw (%rdi), %ymm0, %ymm0
657
- ; AVX512BW-FAST-NEXT: vpinsrw $6, (%rdi), %xmm0, %xmm0
658
- ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],mem[7]
659
- ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
660
- ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx)
661
- ; AVX512BW-FAST-NEXT: vzeroupper
662
- ; AVX512BW-FAST-NEXT: retq
644
+ ; AVX512BW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
645
+ ; AVX512BW: # %bb.0:
646
+ ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [16,25,16,27,16,29,0,23]
647
+ ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm1
648
+ ; AVX512BW-NEXT: vpermt2w (%rdi), %ymm0, %ymm1
649
+ ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],mem[7]
650
+ ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
651
+ ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
652
+ ; AVX512BW-NEXT: vzeroupper
653
+ ; AVX512BW-NEXT: retq
663
654
%in.vec = load <64 x i8 >, ptr %in.elt.ptr , align 64
664
655
%in.vec.cast = bitcast <64 x i8 > %in.vec to <32 x i16 >
665
656
%broadcast.of.zextinreg = shufflevector <32 x i16 > %in.vec.cast , <32 x i16 > poison, <8 x i32 > <i32 0 , i32 9 , i32 0 , i32 11 , i32 0 , i32 13 , i32 0 , i32 15 >
@@ -735,25 +726,18 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
735
726
; AVX512DQ-NEXT: vzeroupper
736
727
; AVX512DQ-NEXT: retq
737
728
;
738
- ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
739
- ; AVX512BW-SLOW: # %bb.0:
740
- ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,13,6,7]
741
- ; AVX512BW-SLOW-NEXT: vpermw (%rdi), %ymm0, %ymm0
742
- ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3]
743
- ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
744
- ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx)
745
- ; AVX512BW-SLOW-NEXT: vzeroupper
746
- ; AVX512BW-SLOW-NEXT: retq
747
- ;
748
- ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
749
- ; AVX512BW-FAST: # %bb.0:
750
- ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,5,6,7]
751
- ; AVX512BW-FAST-NEXT: vpermw (%rdi), %ymm0, %ymm0
752
- ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],mem[5,6,7]
753
- ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
754
- ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx)
755
- ; AVX512BW-FAST-NEXT: vzeroupper
756
- ; AVX512BW-FAST-NEXT: retq
729
+ ; AVX512BW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
730
+ ; AVX512BW: # %bb.0:
731
+ ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
732
+ ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
733
+ ; AVX512BW-NEXT: vmovd %xmm0, %eax
734
+ ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7]
735
+ ; AVX512BW-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
736
+ ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
737
+ ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
738
+ ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
739
+ ; AVX512BW-NEXT: vzeroupper
740
+ ; AVX512BW-NEXT: retq
757
741
%in.vec = load <64 x i8 >, ptr %in.elt.ptr , align 64
758
742
%in.vec.cast = bitcast <64 x i8 > %in.vec to <32 x i16 >
759
743
%broadcast.of.zextinreg = shufflevector <32 x i16 > %in.vec.cast , <32 x i16 > poison, <8 x i32 > <i32 0 , i32 9 , i32 10 , i32 11 , i32 0 , i32 13 , i32 14 , i32 15 >
0 commit comments