@@ -1888,14 +1888,15 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
1888
1888
;
1889
1889
; AVX2-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
1890
1890
; AVX2: # %bb.0:
1891
- ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0
1892
- ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
1893
- ; AVX2-NEXT: vpbroadcastb (%rdi), %ymm1
1894
- ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1895
- ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
1896
- ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
1897
- ; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
1898
- ; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
1891
+ ; AVX2-NEXT: vpbroadcastb (%rdi), %xmm0
1892
+ ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
1893
+ ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
1894
+ ; AVX2-NEXT: vpbroadcastb (%rdi), %ymm2
1895
+ ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
1896
+ ; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
1897
+ ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1898
+ ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
1899
+ ; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
1899
1900
; AVX2-NEXT: vzeroupper
1900
1901
; AVX2-NEXT: retq
1901
1902
;
@@ -2111,14 +2112,15 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
2111
2112
;
2112
2113
; AVX2-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
2113
2114
; AVX2: # %bb.0:
2114
- ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0
2115
- ; AVX2-NEXT: vpbroadcastb (%rdi), %ymm1
2116
- ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
2117
- ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2118
- ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2119
- ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2120
- ; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
2121
- ; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
2115
+ ; AVX2-NEXT: vpbroadcastb (%rdi), %xmm0
2116
+ ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
2117
+ ; AVX2-NEXT: vpbroadcastb (%rdi), %ymm2
2118
+ ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
2119
+ ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
2120
+ ; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2121
+ ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2122
+ ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
2123
+ ; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
2122
2124
; AVX2-NEXT: vzeroupper
2123
2125
; AVX2-NEXT: retq
2124
2126
;
@@ -2235,29 +2237,33 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e
2235
2237
;
2236
2238
; AVX512F-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
2237
2239
; AVX512F: # %bb.0:
2238
- ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
2239
- ; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2240
- ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
2241
- ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1
2242
- ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2243
- ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2244
- ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2245
- ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
2246
- ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
2240
+ ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0
2241
+ ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
2242
+ ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
2243
+ ; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
2244
+ ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
2245
+ ; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm2
2246
+ ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2247
+ ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2248
+ ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2249
+ ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx)
2250
+ ; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
2247
2251
; AVX512F-NEXT: vzeroupper
2248
2252
; AVX512F-NEXT: retq
2249
2253
;
2250
2254
; AVX512DQ-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
2251
2255
; AVX512DQ: # %bb.0:
2252
- ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
2253
- ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2254
- ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
2255
- ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1
2256
- ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2257
- ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2258
- ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2259
- ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
2260
- ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
2256
+ ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0
2257
+ ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
2258
+ ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
2259
+ ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
2260
+ ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
2261
+ ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm2
2262
+ ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2263
+ ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2264
+ ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2265
+ ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx)
2266
+ ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx)
2261
2267
; AVX512DQ-NEXT: vzeroupper
2262
2268
; AVX512DQ-NEXT: retq
2263
2269
;
@@ -2266,8 +2272,9 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e
2266
2272
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
2267
2273
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2268
2274
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
2269
- ; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
2275
+ ; AVX512BW-NEXT: vpbroadcastb (%rdi), %xmm1
2270
2276
; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2277
+ ; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
2271
2278
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2272
2279
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2273
2280
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
@@ -2332,14 +2339,15 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.e
2332
2339
;
2333
2340
; AVX2-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
2334
2341
; AVX2: # %bb.0:
2335
- ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0
2336
- ; AVX2-NEXT: vpbroadcastb (%rdi), %ymm1
2337
- ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2338
- ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2339
- ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2340
- ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2341
- ; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
2342
- ; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
2342
+ ; AVX2-NEXT: vpbroadcastb (%rdi), %xmm0
2343
+ ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
2344
+ ; AVX2-NEXT: vpbroadcastb (%rdi), %ymm2
2345
+ ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2346
+ ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
2347
+ ; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2348
+ ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2349
+ ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
2350
+ ; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
2343
2351
; AVX2-NEXT: vzeroupper
2344
2352
; AVX2-NEXT: retq
2345
2353
;
@@ -2454,29 +2462,33 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
2454
2462
;
2455
2463
; AVX512F-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
2456
2464
; AVX512F: # %bb.0:
2457
- ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
2458
- ; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2459
- ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
2460
- ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1
2461
- ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2462
- ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2463
- ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2464
- ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
2465
- ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
2465
+ ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0
2466
+ ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
2467
+ ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
2468
+ ; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
2469
+ ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
2470
+ ; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm2
2471
+ ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2472
+ ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2473
+ ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2474
+ ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx)
2475
+ ; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
2466
2476
; AVX512F-NEXT: vzeroupper
2467
2477
; AVX512F-NEXT: retq
2468
2478
;
2469
2479
; AVX512DQ-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
2470
2480
; AVX512DQ: # %bb.0:
2471
- ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
2472
- ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2473
- ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
2474
- ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1
2475
- ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2476
- ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2477
- ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2478
- ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
2479
- ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
2481
+ ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0
2482
+ ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
2483
+ ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
2484
+ ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
2485
+ ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
2486
+ ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm2
2487
+ ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2488
+ ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2489
+ ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2490
+ ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx)
2491
+ ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx)
2480
2492
; AVX512DQ-NEXT: vzeroupper
2481
2493
; AVX512DQ-NEXT: retq
2482
2494
;
@@ -2485,8 +2497,9 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
2485
2497
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
2486
2498
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2487
2499
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
2488
- ; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
2500
+ ; AVX512BW-NEXT: vpbroadcastb (%rdi), %xmm1
2489
2501
; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2502
+ ; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
2490
2503
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2491
2504
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2492
2505
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
@@ -2775,13 +2788,14 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i
2775
2788
;
2776
2789
; AVX2-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
2777
2790
; AVX2: # %bb.0:
2778
- ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0
2779
- ; AVX2-NEXT: vpbroadcastw (%rdi), %ymm1
2780
- ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
2781
- ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2782
- ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2783
- ; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
2784
- ; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
2791
+ ; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0
2792
+ ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
2793
+ ; AVX2-NEXT: vpbroadcastw (%rdi), %ymm2
2794
+ ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
2795
+ ; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2796
+ ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2797
+ ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
2798
+ ; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
2785
2799
; AVX2-NEXT: vzeroupper
2786
2800
; AVX2-NEXT: retq
2787
2801
;
@@ -2976,13 +2990,14 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
2976
2990
;
2977
2991
; AVX2-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
2978
2992
; AVX2: # %bb.0:
2979
- ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0
2980
- ; AVX2-NEXT: vpbroadcastw (%rdi), %ymm1
2981
- ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
2982
- ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2983
- ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2984
- ; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
2985
- ; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
2993
+ ; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0
2994
+ ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
2995
+ ; AVX2-NEXT: vpbroadcastw (%rdi), %ymm2
2996
+ ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
2997
+ ; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2998
+ ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2999
+ ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
3000
+ ; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
2986
3001
; AVX2-NEXT: vzeroupper
2987
3002
; AVX2-NEXT: retq
2988
3003
;
@@ -3093,25 +3108,27 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
3093
3108
;
3094
3109
; AVX512F-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
3095
3110
; AVX512F: # %bb.0:
3096
- ; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm0
3111
+ ; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm0
3097
3112
; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7]
3098
- ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3099
- ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
3100
- ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
3101
- ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx)
3102
- ; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
3113
+ ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
3114
+ ; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm1
3115
+ ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
3116
+ ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
3117
+ ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
3118
+ ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
3103
3119
; AVX512F-NEXT: vzeroupper
3104
3120
; AVX512F-NEXT: retq
3105
3121
;
3106
3122
; AVX512DQ-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
3107
3123
; AVX512DQ: # %bb.0:
3108
- ; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm0
3124
+ ; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm0
3109
3125
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7]
3110
- ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3111
- ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
3112
- ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
3113
- ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx)
3114
- ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx)
3126
+ ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
3127
+ ; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm1
3128
+ ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
3129
+ ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
3130
+ ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
3131
+ ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
3115
3132
; AVX512DQ-NEXT: vzeroupper
3116
3133
; AVX512DQ-NEXT: retq
3117
3134
;
0 commit comments