@@ -1220,7 +1220,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v
1220
1220
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1221
1221
; AVX2-NEXT: vmovdqa (%rdi), %xmm1
1222
1222
; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1223
- ; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1
1223
+ ; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
1224
1224
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
1225
1225
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
1226
1226
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
@@ -1234,7 +1234,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v
1234
1234
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1235
1235
; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
1236
1236
; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1237
- ; AVX512F-NEXT: vpbroadcastd %xmm1, %ymm1
1237
+ ; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1
1238
1238
; AVX512F-NEXT: vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
1239
1239
; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm0
1240
1240
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1247,7 +1247,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v
1247
1247
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1248
1248
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
1249
1249
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1250
- ; AVX512DQ-NEXT: vpbroadcastd %xmm1, %ymm1
1250
+ ; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1
1251
1251
; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
1252
1252
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm0
1253
1253
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1259,10 +1259,9 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v
1259
1259
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1260
1260
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
1261
1261
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1262
- ; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm0
1263
1262
; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111
1264
1263
; AVX512BW-NEXT: kmovd %eax, %k1
1265
- ; AVX512BW-NEXT: vmovdqu8 %ymm0 , %ymm1 {%k1}
1264
+ ; AVX512BW-NEXT: vpbroadcastb %xmm0 , %ymm1 {%k1}
1266
1265
; AVX512BW-NEXT: vpaddb (%rdx), %zmm1, %zmm0
1267
1266
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
1268
1267
; AVX512BW-NEXT: vzeroupper
@@ -1345,7 +1344,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v
1345
1344
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1346
1345
; AVX2-NEXT: vmovdqa (%rdi), %xmm1
1347
1346
; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1348
- ; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
1347
+ ; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
1349
1348
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
1350
1349
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
1351
1350
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
@@ -1359,7 +1358,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v
1359
1358
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1360
1359
; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
1361
1360
; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1362
- ; AVX512F-NEXT: vpbroadcastq %xmm1, %ymm1
1361
+ ; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1
1363
1362
; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
1364
1363
; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm0
1365
1364
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1372,7 +1371,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v
1372
1371
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1373
1372
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
1374
1373
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1375
- ; AVX512DQ-NEXT: vpbroadcastq %xmm1, %ymm1
1374
+ ; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1
1376
1375
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
1377
1376
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm0
1378
1377
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1384,10 +1383,9 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v
1384
1383
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1385
1384
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
1386
1385
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1387
- ; AVX512BW-NEXT: vpbroadcastq %xmm0, %ymm0
1388
1386
; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101
1389
1387
; AVX512BW-NEXT: kmovd %eax, %k1
1390
- ; AVX512BW-NEXT: vmovdqu8 %ymm0 , %ymm1 {%k1}
1388
+ ; AVX512BW-NEXT: vpbroadcastb %xmm0 , %ymm1 {%k1}
1391
1389
; AVX512BW-NEXT: vpaddb (%rdx), %zmm1, %zmm0
1392
1390
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
1393
1391
; AVX512BW-NEXT: vzeroupper
@@ -1719,7 +1717,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in.
1719
1717
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1720
1718
; AVX2-NEXT: vmovdqa (%rdi), %xmm1
1721
1719
; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1722
- ; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
1720
+ ; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1
1723
1721
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
1724
1722
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1725
1723
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1732,7 +1730,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in.
1732
1730
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1733
1731
; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
1734
1732
; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1735
- ; AVX512F-NEXT: vpbroadcastq %xmm1, %ymm1
1733
+ ; AVX512F-NEXT: vpbroadcastw %xmm1, %ymm1
1736
1734
; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
1737
1735
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1738
1736
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1745,7 +1743,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in.
1745
1743
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1746
1744
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
1747
1745
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1748
- ; AVX512DQ-NEXT: vpbroadcastq %xmm1, %ymm1
1746
+ ; AVX512DQ-NEXT: vpbroadcastw %xmm1, %ymm1
1749
1747
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
1750
1748
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1751
1749
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
@@ -2691,14 +2689,13 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
2691
2689
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
2692
2690
; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2693
2691
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2694
- ; AVX2-NEXT: vpbroadcastd %xmm0, %ymm2
2695
- ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
2696
- ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
2697
- ; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
2692
+ ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
2693
+ ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
2694
+ ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
2698
2695
; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
2699
2696
; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2700
- ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
2701
2697
; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
2698
+ ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
2702
2699
; AVX2-NEXT: vzeroupper
2703
2700
; AVX2-NEXT: retq
2704
2701
;
@@ -2708,10 +2705,9 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
2708
2705
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
2709
2706
; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2710
2707
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2711
- ; AVX512F-NEXT: vpbroadcastd %xmm0, %ymm2
2712
- ; AVX512F-NEXT: vpternlogd {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1))
2713
2708
; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
2714
- ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
2709
+ ; AVX512F-NEXT: vpternlogd {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0))
2710
+ ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
2715
2711
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2716
2712
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
2717
2713
; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
@@ -2724,10 +2720,9 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
2724
2720
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
2725
2721
; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2726
2722
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2727
- ; AVX512DQ-NEXT: vpbroadcastd %xmm0, %ymm2
2728
- ; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1))
2729
2723
; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
2730
- ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
2724
+ ; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0))
2725
+ ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
2731
2726
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2732
2727
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
2733
2728
; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
@@ -2739,11 +2734,10 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
2739
2734
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
2740
2735
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2741
2736
; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
2742
- ; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm2
2737
+ ; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
2743
2738
; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111
2744
2739
; AVX512BW-NEXT: kmovd %eax, %k1
2745
- ; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1}
2746
- ; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
2740
+ ; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
2747
2741
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
2748
2742
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
2749
2743
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
@@ -2959,14 +2953,13 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
2959
2953
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
2960
2954
; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2961
2955
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2962
- ; AVX2-NEXT: vpbroadcastq %xmm0, %ymm2
2963
- ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2964
- ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
2965
- ; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
2956
+ ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
2957
+ ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2958
+ ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
2966
2959
; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
2967
2960
; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2968
- ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
2969
2961
; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
2962
+ ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
2970
2963
; AVX2-NEXT: vzeroupper
2971
2964
; AVX2-NEXT: retq
2972
2965
;
@@ -2976,10 +2969,9 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
2976
2969
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
2977
2970
; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2978
2971
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2979
- ; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm2
2980
- ; AVX512F-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1))
2981
2972
; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
2982
- ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
2973
+ ; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0))
2974
+ ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
2983
2975
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2984
2976
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
2985
2977
; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
@@ -2992,10 +2984,9 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
2992
2984
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
2993
2985
; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2994
2986
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2995
- ; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm2
2996
- ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1))
2997
2987
; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
2998
- ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
2988
+ ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0))
2989
+ ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
2999
2990
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3000
2991
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
3001
2992
; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
@@ -3007,11 +2998,10 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
3007
2998
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
3008
2999
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
3009
3000
; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
3010
- ; AVX512BW-NEXT: vpbroadcastq %xmm0, %ymm2
3001
+ ; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
3011
3002
; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101
3012
3003
; AVX512BW-NEXT: kmovd %eax, %k1
3013
- ; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1}
3014
- ; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
3004
+ ; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
3015
3005
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
3016
3006
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
3017
3007
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
@@ -3742,14 +3732,13 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
3742
3732
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
3743
3733
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
3744
3734
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3745
- ; AVX2-NEXT: vpbroadcastw %xmm0, %xmm2
3746
3735
; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3747
- ; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0
3748
- ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
3749
- ; AVX2-NEXT: vpaddb (%rdx), %ymm0 , %ymm0
3750
- ; AVX2-NEXT: vpaddb 32(%rdx), %ymm2 , %ymm1
3751
- ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
3752
- ; AVX2-NEXT: vmovdqa %ymm1, 32 (%rcx)
3736
+ ; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
3737
+ ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
3738
+ ; AVX2-NEXT: vpaddb (%rdx), %ymm1 , %ymm1
3739
+ ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0 , %ymm0
3740
+ ; AVX2-NEXT: vmovdqa %ymm0, 32 (%rcx)
3741
+ ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
3753
3742
; AVX2-NEXT: vzeroupper
3754
3743
; AVX2-NEXT: retq
3755
3744
;
@@ -3759,9 +3748,8 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
3759
3748
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
3760
3749
; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3761
3750
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3762
- ; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm2
3763
- ; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
3764
3751
; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0
3752
+ ; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
3765
3753
; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3766
3754
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3767
3755
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
@@ -3775,9 +3763,8 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
3775
3763
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
3776
3764
; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3777
3765
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3778
- ; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm2
3779
- ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
3780
3766
; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0
3767
+ ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
3781
3768
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3782
3769
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3783
3770
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
0 commit comments