@@ -3652,13 +3652,11 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i
3652
3652
; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
3653
3653
; AVX512BW-FAST: # %bb.0:
3654
3654
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
3655
- ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47]
3656
3655
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
3657
- ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
3658
- ; AVX512BW-FAST-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
3659
- ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero
3660
- ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
3661
- ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
3656
+ ; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
3657
+ ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47,0,49,0,51,0,53,0,55,0,0,0,0,0,0,0,0]
3658
+ ; AVX512BW-FAST-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
3659
+ ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm2, %zmm0
3662
3660
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
3663
3661
; AVX512BW-FAST-NEXT: vzeroupper
3664
3662
; AVX512BW-FAST-NEXT: retq
@@ -3857,13 +3855,11 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
3857
3855
; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
3858
3856
; AVX512BW-FAST: # %bb.0:
3859
3857
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
3860
- ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,0,28,29,0,31,40,0,42,43,0,45,46,0]
3861
3858
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
3862
- ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
3863
- ; AVX512BW-FAST-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
3864
- ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero
3865
- ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
3866
- ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
3859
+ ; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
3860
+ ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,25,26,0,28,29,0,31,40,0,42,43,0,45,46,0,48,49,0,51,52,0,54,55,0,0,0,0,0,0,0,0]
3861
+ ; AVX512BW-FAST-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
3862
+ ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm2, %zmm0
3867
3863
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
3868
3864
; AVX512BW-FAST-NEXT: vzeroupper
3869
3865
; AVX512BW-FAST-NEXT: retq
@@ -4085,13 +4081,11 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
4085
4081
; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
4086
4082
; AVX512BW-FAST: # %bb.0:
4087
4083
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
4088
- ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,0,29,30,31,0,41,42,43,0,45,46,47]
4089
4084
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4090
- ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
4091
- ; AVX512BW-FAST-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
4092
- ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero
4093
- ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
4094
- ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4085
+ ; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
4086
+ ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,25,26,27,0,29,30,31,0,41,42,43,0,45,46,47,0,49,50,51,0,53,54,55,0,0,0,0,0,0,0,0]
4087
+ ; AVX512BW-FAST-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
4088
+ ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm2, %zmm0
4095
4089
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
4096
4090
; AVX512BW-FAST-NEXT: vzeroupper
4097
4091
; AVX512BW-FAST-NEXT: retq
@@ -4292,13 +4286,11 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
4292
4286
; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
4293
4287
; AVX512BW-FAST: # %bb.0:
4294
4288
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
4295
- ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,0,31,40,41,42,43,0,45,46,47]
4296
4289
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4297
- ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
4298
- ; AVX512BW-FAST-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
4299
- ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4300
- ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
4301
- ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4290
+ ; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
4291
+ ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,25,26,27,28,29,0,31,40,41,42,43,0,45,46,47,48,49,0,51,52,53,54,55,0,0,0,0,0,0,0,0]
4292
+ ; AVX512BW-FAST-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
4293
+ ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm2, %zmm0
4302
4294
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
4303
4295
; AVX512BW-FAST-NEXT: vzeroupper
4304
4296
; AVX512BW-FAST-NEXT: retq
@@ -5101,32 +5093,17 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
5101
5093
; AVX512DQ-NEXT: vzeroupper
5102
5094
; AVX512DQ-NEXT: retq
5103
5095
;
5104
- ; AVX512BW-SLOW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
5105
- ; AVX512BW-SLOW: # %bb.0:
5106
- ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
5107
- ; AVX512BW-SLOW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,21,22,23]
5108
- ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
5109
- ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
5110
- ; AVX512BW-SLOW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1
5111
- ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
5112
- ; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
5113
- ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
5114
- ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
5115
- ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
5116
- ; AVX512BW-SLOW-NEXT: vzeroupper
5117
- ; AVX512BW-SLOW-NEXT: retq
5118
- ;
5119
- ; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
5120
- ; AVX512BW-FAST: # %bb.0:
5121
- ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
5122
- ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
5123
- ; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
5124
- ; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,28,29,30,31]
5125
- ; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
5126
- ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm2, %zmm0
5127
- ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
5128
- ; AVX512BW-FAST-NEXT: vzeroupper
5129
- ; AVX512BW-FAST-NEXT: retq
5096
+ ; AVX512BW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
5097
+ ; AVX512BW: # %bb.0:
5098
+ ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
5099
+ ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
5100
+ ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
5101
+ ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,28,29,30,31]
5102
+ ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
5103
+ ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
5104
+ ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
5105
+ ; AVX512BW-NEXT: vzeroupper
5106
+ ; AVX512BW-NEXT: retq
5130
5107
%in.vec.base = load <64 x i8 >, ptr %in.vec.base.ptr , align 64
5131
5108
%in.vec.bias = load <64 x i8 >, ptr %in.vec.bias.ptr , align 64
5132
5109
%in.vec = add <64 x i8 > %in.vec.base , %in.vec.bias
@@ -5381,13 +5358,11 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
5381
5358
; AVX512BW-SLOW-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
5382
5359
; AVX512BW-SLOW: # %bb.0:
5383
5360
; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
5384
- ; AVX512BW-SLOW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,0,11]
5385
5361
; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
5386
- ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
5387
- ; AVX512BW-SLOW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
5388
- ; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
5389
- ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
5390
- ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
5362
+ ; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
5363
+ ; AVX512BW-SLOW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,7,0,11,0,13,0,0]
5364
+ ; AVX512BW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
5365
+ ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
5391
5366
; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
5392
5367
; AVX512BW-SLOW-NEXT: vzeroupper
5393
5368
; AVX512BW-SLOW-NEXT: retq
0 commit comments