@@ -3129,8 +3129,8 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
3129
3129
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
3130
3130
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7]
3131
3131
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
3132
- ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
3133
3132
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
3133
+ ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
3134
3134
; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0
3135
3135
; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2
3136
3136
; AVX-NEXT: vmovdqa %xmm2, 16(%rdx)
@@ -3141,14 +3141,13 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
3141
3141
; AVX2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
3142
3142
; AVX2: # %bb.0:
3143
3143
; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0
3144
- ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7]
3145
3144
; AVX2-NEXT: vpbroadcastw (%rdi), %xmm1
3146
- ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3147
- ; AVX2-NEXT: vpbroadcastw (%rdi), %ymm1
3148
- ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
3149
- ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
3150
- ; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
3151
- ; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
3145
+ ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7]
3146
+ ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3147
+ ; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
3148
+ ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
3149
+ ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
3150
+ ; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
3152
3151
; AVX2-NEXT: vzeroupper
3153
3152
; AVX2-NEXT: retq
3154
3153
;
@@ -3234,13 +3233,17 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i
3234
3233
; AVX-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
3235
3234
; AVX: # %bb.0:
3236
3235
; AVX-NEXT: vmovdqa (%rdi), %xmm0
3237
- ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5,6,7]
3238
- ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
3239
- ; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
3236
+ ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
3237
+ ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],mem[1,2,3,4,5,6,7]
3238
+ ; AVX-NEXT: vpaddb (%rsi), %xmm2, %xmm2
3239
+ ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3240
+ ; AVX-NEXT: vmovdqa (%rdi), %xmm3
3241
+ ; AVX-NEXT: vpaddb 32(%rsi), %xmm3, %xmm3
3240
3242
; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
3241
3243
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
3242
- ; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
3243
- ; AVX-NEXT: vmovdqa %xmm1, (%rdx)
3244
+ ; AVX-NEXT: vmovdqa %xmm3, 32(%rdx)
3245
+ ; AVX-NEXT: vmovdqa %xmm1, 48(%rdx)
3246
+ ; AVX-NEXT: vmovdqa %xmm2, (%rdx)
3244
3247
; AVX-NEXT: retq
3245
3248
;
3246
3249
; AVX2-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
@@ -3516,16 +3519,16 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
3516
3519
; SSE42-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
3517
3520
; SSE42: # %bb.0:
3518
3521
; SSE42-NEXT: movdqa (%rdi), %xmm0
3519
- ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
3520
- ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
3521
- ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3522
- ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5],xmm0[6,7]
3523
- ; SSE42-NEXT: paddb (%rsi), %xmm0
3524
- ; SSE42-NEXT: paddb 16(%rsi), %xmm2
3525
- ; SSE42-NEXT: paddb 32(%rsi), %xmm1
3526
- ; SSE42-NEXT: movdqa %xmm1, 32(%rdx)
3527
- ; SSE42-NEXT: movdqa %xmm2, 16(%rdx)
3528
- ; SSE42-NEXT: movdqa %xmm0, (%rdx)
3522
+ ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
3523
+ ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],mem[2,3,4,5],xmm1[6,7]
3524
+ ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
3525
+ ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3526
+ ; SSE42-NEXT: paddb (%rsi), %xmm1
3527
+ ; SSE42-NEXT: paddb 16(%rsi), %xmm0
3528
+ ; SSE42-NEXT: paddb 32(%rsi), %xmm2
3529
+ ; SSE42-NEXT: movdqa %xmm2, 32(%rdx)
3530
+ ; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
3531
+ ; SSE42-NEXT: movdqa %xmm1, (%rdx)
3529
3532
; SSE42-NEXT: retq
3530
3533
;
3531
3534
; AVX-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
@@ -3534,8 +3537,8 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
3534
3537
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],mem[2,3,4,5,6,7]
3535
3538
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
3536
3539
; AVX-NEXT: vbroadcastss (%rdi), %xmm2
3540
+ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
3537
3541
; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
3538
- ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3539
3542
; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0
3540
3543
; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2
3541
3544
; AVX-NEXT: vmovdqa %xmm2, 16(%rdx)
@@ -3546,10 +3549,10 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
3546
3549
; AVX2-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
3547
3550
; AVX2: # %bb.0:
3548
3551
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
3552
+ ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,0,1,1]
3549
3553
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
3550
- ; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,6,0]
3551
- ; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
3552
- ; AVX2-NEXT: vpbroadcastd (%rdi), %ymm1
3554
+ ; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0]
3555
+ ; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
3553
3556
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
3554
3557
; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
3555
3558
; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
@@ -3631,15 +3634,19 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
3631
3634
;
3632
3635
; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
3633
3636
; AVX: # %bb.0:
3634
- ; AVX-NEXT: vmovdqa 48(%rdi), %xmm0
3635
- ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = mem[0,1],xmm0[2,3,4,5,6,7]
3636
- ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3637
- ; AVX-NEXT: vmovdqa (%rdi), %xmm1
3638
- ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm2
3639
- ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
3640
- ; AVX-NEXT: vmovdqa %xmm1, 16(%rdx)
3637
+ ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
3638
+ ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],mem[1,2,3]
3639
+ ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
3640
+ ; AVX-NEXT: vmovdqa (%rdi), %xmm2
3641
+ ; AVX-NEXT: vmovdqa 16(%rdi), %xmm3
3642
+ ; AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3
3643
+ ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
3644
+ ; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
3641
3645
; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
3642
- ; AVX-NEXT: vmovdqa %xmm0, (%rdx)
3646
+ ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
3647
+ ; AVX-NEXT: vmovdqa %xmm3, 48(%rdx)
3648
+ ; AVX-NEXT: vmovdqa %xmm1, (%rdx)
3649
+ ; AVX-NEXT: vzeroupper
3643
3650
; AVX-NEXT: retq
3644
3651
;
3645
3652
; AVX2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
@@ -3701,25 +3708,26 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
3701
3708
define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2 (ptr %in.elt.ptr , ptr %out.vec.bias.ptr , ptr %out.vec.ptr ) nounwind {
3702
3709
; SSE2-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
3703
3710
; SSE2: # %bb.0:
3704
- ; SSE2-NEXT: movdqa (%rdi), %xmm0
3711
+ ; SSE2-NEXT: movaps (%rdi), %xmm0
3705
3712
; SSE2-NEXT: movaps 48(%rdi), %xmm1
3706
- ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
3707
3713
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
3708
- ; SSE2-NEXT: paddb 16(%rsi), %xmm2
3714
+ ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3715
+ ; SSE2-NEXT: paddb 16(%rsi), %xmm0
3709
3716
; SSE2-NEXT: paddb (%rsi), %xmm1
3710
3717
; SSE2-NEXT: movdqa %xmm1, (%rdx)
3711
- ; SSE2-NEXT: movdqa %xmm2, 16(%rdx)
3718
+ ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
3712
3719
; SSE2-NEXT: retq
3713
3720
;
3714
3721
; SSE42-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
3715
3722
; SSE42: # %bb.0:
3716
3723
; SSE42-NEXT: movdqa (%rdi), %xmm0
3717
- ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
3718
- ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7]
3719
- ; SSE42-NEXT: paddb 16(%rsi), %xmm1
3720
- ; SSE42-NEXT: paddb (%rsi), %xmm0
3721
- ; SSE42-NEXT: movdqa %xmm0, (%rdx)
3722
- ; SSE42-NEXT: movdqa %xmm1, 16(%rdx)
3724
+ ; SSE42-NEXT: movdqa 48(%rdi), %xmm1
3725
+ ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
3726
+ ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3727
+ ; SSE42-NEXT: paddb 16(%rsi), %xmm0
3728
+ ; SSE42-NEXT: paddb (%rsi), %xmm1
3729
+ ; SSE42-NEXT: movdqa %xmm1, (%rdx)
3730
+ ; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
3723
3731
; SSE42-NEXT: retq
3724
3732
;
3725
3733
; AVX-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
@@ -3812,15 +3820,19 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
3812
3820
;
3813
3821
; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
3814
3822
; AVX: # %bb.0:
3815
- ; AVX-NEXT: vmovdqa 48(%rdi), %xmm0
3816
- ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = mem[0,1,2,3],xmm0[4,5,6,7]
3817
- ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3818
- ; AVX-NEXT: vmovdqa (%rdi), %xmm1
3819
- ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm2
3820
- ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
3821
- ; AVX-NEXT: vmovdqa %xmm1, 16(%rdx)
3823
+ ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
3824
+ ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
3825
+ ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
3826
+ ; AVX-NEXT: vmovdqa (%rdi), %xmm2
3827
+ ; AVX-NEXT: vmovdqa 16(%rdi), %xmm3
3828
+ ; AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3
3829
+ ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
3830
+ ; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
3822
3831
; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
3823
- ; AVX-NEXT: vmovdqa %xmm0, (%rdx)
3832
+ ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
3833
+ ; AVX-NEXT: vmovdqa %xmm3, 48(%rdx)
3834
+ ; AVX-NEXT: vmovdqa %xmm1, (%rdx)
3835
+ ; AVX-NEXT: vzeroupper
3824
3836
; AVX-NEXT: retq
3825
3837
;
3826
3838
; AVX2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
0 commit comments