@@ -1888,15 +1888,14 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
1888
1888
;
1889
1889
; AVX2-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
1890
1890
; AVX2: # %bb.0:
1891
- ; AVX2-NEXT: vpbroadcastb (%rdi), %xmm0
1892
- ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
1893
- ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
1894
- ; AVX2-NEXT: vpbroadcastb (%rdi), %ymm2
1895
- ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
1896
- ; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
1897
- ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1898
- ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
1899
- ; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
1891
+ ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0
1892
+ ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
1893
+ ; AVX2-NEXT: vpbroadcastb (%rdi), %ymm1
1894
+ ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1895
+ ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
1896
+ ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
1897
+ ; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
1898
+ ; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
1900
1899
; AVX2-NEXT: vzeroupper
1901
1900
; AVX2-NEXT: retq
1902
1901
;
@@ -2112,15 +2111,14 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
2112
2111
;
2113
2112
; AVX2-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
2114
2113
; AVX2: # %bb.0:
2115
- ; AVX2-NEXT: vpbroadcastb (%rdi), %xmm0
2116
- ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
2117
- ; AVX2-NEXT: vpbroadcastb (%rdi), %ymm2
2118
- ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
2119
- ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
2120
- ; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2121
- ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2122
- ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
2123
- ; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
2114
+ ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0
2115
+ ; AVX2-NEXT: vpbroadcastb (%rdi), %ymm1
2116
+ ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
2117
+ ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2118
+ ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2119
+ ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2120
+ ; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
2121
+ ; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
2124
2122
; AVX2-NEXT: vzeroupper
2125
2123
; AVX2-NEXT: retq
2126
2124
;
@@ -2237,33 +2235,29 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e
2237
2235
;
2238
2236
; AVX512F-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
2239
2237
; AVX512F: # %bb.0:
2240
- ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0
2241
- ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
2242
- ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
2243
- ; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
2244
- ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
2245
- ; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm2
2246
- ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2247
- ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2248
- ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2249
- ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx)
2250
- ; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
2238
+ ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
2239
+ ; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2240
+ ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
2241
+ ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1
2242
+ ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2243
+ ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2244
+ ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2245
+ ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
2246
+ ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
2251
2247
; AVX512F-NEXT: vzeroupper
2252
2248
; AVX512F-NEXT: retq
2253
2249
;
2254
2250
; AVX512DQ-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
2255
2251
; AVX512DQ: # %bb.0:
2256
- ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0
2257
- ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
2258
- ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
2259
- ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
2260
- ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
2261
- ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm2
2262
- ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2263
- ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2264
- ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2265
- ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx)
2266
- ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx)
2252
+ ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
2253
+ ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2254
+ ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
2255
+ ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1
2256
+ ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2257
+ ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2258
+ ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2259
+ ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
2260
+ ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
2267
2261
; AVX512DQ-NEXT: vzeroupper
2268
2262
; AVX512DQ-NEXT: retq
2269
2263
;
@@ -2272,9 +2266,8 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e
2272
2266
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
2273
2267
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2274
2268
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
2275
- ; AVX512BW-NEXT: vpbroadcastb (%rdi), %xmm1
2276
- ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2277
2269
; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
2270
+ ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2278
2271
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2279
2272
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2280
2273
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
@@ -2339,15 +2332,14 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.e
2339
2332
;
2340
2333
; AVX2-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
2341
2334
; AVX2: # %bb.0:
2342
- ; AVX2-NEXT: vpbroadcastb (%rdi), %xmm0
2343
- ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
2344
- ; AVX2-NEXT: vpbroadcastb (%rdi), %ymm2
2345
- ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2346
- ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
2347
- ; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2348
- ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2349
- ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
2350
- ; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
2335
+ ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0
2336
+ ; AVX2-NEXT: vpbroadcastb (%rdi), %ymm1
2337
+ ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2338
+ ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2339
+ ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2340
+ ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2341
+ ; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
2342
+ ; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
2351
2343
; AVX2-NEXT: vzeroupper
2352
2344
; AVX2-NEXT: retq
2353
2345
;
@@ -2462,33 +2454,29 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
2462
2454
;
2463
2455
; AVX512F-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
2464
2456
; AVX512F: # %bb.0:
2465
- ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0
2466
- ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
2467
- ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
2468
- ; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
2469
- ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
2470
- ; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm2
2471
- ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2472
- ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2473
- ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2474
- ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx)
2475
- ; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
2457
+ ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
2458
+ ; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2459
+ ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
2460
+ ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1
2461
+ ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2462
+ ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2463
+ ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2464
+ ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
2465
+ ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
2476
2466
; AVX512F-NEXT: vzeroupper
2477
2467
; AVX512F-NEXT: retq
2478
2468
;
2479
2469
; AVX512DQ-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
2480
2470
; AVX512DQ: # %bb.0:
2481
- ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0
2482
- ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
2483
- ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
2484
- ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
2485
- ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
2486
- ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm2
2487
- ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2488
- ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2489
- ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2490
- ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx)
2491
- ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx)
2471
+ ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
2472
+ ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2473
+ ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
2474
+ ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1
2475
+ ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2476
+ ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2477
+ ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2478
+ ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
2479
+ ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
2492
2480
; AVX512DQ-NEXT: vzeroupper
2493
2481
; AVX512DQ-NEXT: retq
2494
2482
;
@@ -2497,9 +2485,8 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
2497
2485
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
2498
2486
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2499
2487
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
2500
- ; AVX512BW-NEXT: vpbroadcastb (%rdi), %xmm1
2501
- ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2502
2488
; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
2489
+ ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2503
2490
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2504
2491
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2505
2492
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
@@ -2788,14 +2775,13 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i
2788
2775
;
2789
2776
; AVX2-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
2790
2777
; AVX2: # %bb.0:
2791
- ; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0
2792
- ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
2793
- ; AVX2-NEXT: vpbroadcastw (%rdi), %ymm2
2794
- ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
2795
- ; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2796
- ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2797
- ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
2798
- ; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
2778
+ ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0
2779
+ ; AVX2-NEXT: vpbroadcastw (%rdi), %ymm1
2780
+ ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
2781
+ ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2782
+ ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2783
+ ; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
2784
+ ; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
2799
2785
; AVX2-NEXT: vzeroupper
2800
2786
; AVX2-NEXT: retq
2801
2787
;
@@ -2990,14 +2976,13 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
2990
2976
;
2991
2977
; AVX2-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
2992
2978
; AVX2: # %bb.0:
2993
- ; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0
2994
- ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
2995
- ; AVX2-NEXT: vpbroadcastw (%rdi), %ymm2
2996
- ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
2997
- ; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2998
- ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2999
- ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
3000
- ; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
2979
+ ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0
2980
+ ; AVX2-NEXT: vpbroadcastw (%rdi), %ymm1
2981
+ ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
2982
+ ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2983
+ ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2984
+ ; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
2985
+ ; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
3001
2986
; AVX2-NEXT: vzeroupper
3002
2987
; AVX2-NEXT: retq
3003
2988
;
@@ -3108,27 +3093,25 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
3108
3093
;
3109
3094
; AVX512F-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
3110
3095
; AVX512F: # %bb.0:
3111
- ; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm0
3096
+ ; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm0
3112
3097
; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7]
3113
- ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
3114
- ; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm1
3115
- ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
3116
- ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
3117
- ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
3118
- ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
3098
+ ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3099
+ ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
3100
+ ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
3101
+ ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx)
3102
+ ; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
3119
3103
; AVX512F-NEXT: vzeroupper
3120
3104
; AVX512F-NEXT: retq
3121
3105
;
3122
3106
; AVX512DQ-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
3123
3107
; AVX512DQ: # %bb.0:
3124
- ; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm0
3108
+ ; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm0
3125
3109
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7]
3126
- ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
3127
- ; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm1
3128
- ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
3129
- ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
3130
- ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
3131
- ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
3110
+ ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3111
+ ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
3112
+ ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
3113
+ ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx)
3114
+ ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx)
3132
3115
; AVX512DQ-NEXT: vzeroupper
3133
3116
; AVX512DQ-NEXT: retq
3134
3117
;
0 commit comments