@@ -910,8 +910,8 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
910
910
; AVX2-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
911
911
; AVX2: # %bb.0:
912
912
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
913
- ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
914
913
; AVX2-NEXT: vpbroadcastb (%rdi), %ymm1
914
+ ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
915
915
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
916
916
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
917
917
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
@@ -920,32 +920,32 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
920
920
;
921
921
; AVX512F-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
922
922
; AVX512F: # %bb.0:
923
- ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0
924
- ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
925
- ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1
926
- ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
923
+ ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0
924
+ ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
925
+ ; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
926
+ ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
927
927
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
928
928
; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
929
929
; AVX512F-NEXT: vzeroupper
930
930
; AVX512F-NEXT: retq
931
931
;
932
932
; AVX512DQ-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
933
933
; AVX512DQ: # %bb.0:
934
- ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0
935
- ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
936
- ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1
937
- ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
934
+ ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0
935
+ ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
936
+ ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
937
+ ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
938
938
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
939
939
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
940
940
; AVX512DQ-NEXT: vzeroupper
941
941
; AVX512DQ-NEXT: retq
942
942
;
943
943
; AVX512BW-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
944
944
; AVX512BW: # %bb.0:
945
- ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm0
946
- ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
947
- ; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
948
- ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
945
+ ; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm0
946
+ ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1
947
+ ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
948
+ ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
949
949
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
950
950
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
951
951
; AVX512BW-NEXT: vzeroupper
@@ -1906,12 +1906,10 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
1906
1906
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
1907
1907
; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1
1908
1908
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1909
- ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1
1910
- ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
1911
- ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
1912
1909
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
1913
- ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
1910
+ ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
1914
1911
; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
1912
+ ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
1915
1913
; AVX512F-NEXT: vzeroupper
1916
1914
; AVX512F-NEXT: retq
1917
1915
;
@@ -1921,12 +1919,10 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
1921
1919
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
1922
1920
; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1
1923
1921
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1924
- ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1
1925
- ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
1926
- ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
1927
1922
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
1928
- ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
1923
+ ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
1929
1924
; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
1925
+ ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
1930
1926
; AVX512DQ-NEXT: vzeroupper
1931
1927
; AVX512DQ-NEXT: retq
1932
1928
;
@@ -1936,7 +1932,6 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
1936
1932
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
1937
1933
; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
1938
1934
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1939
- ; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
1940
1935
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1941
1936
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
1942
1937
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
@@ -2133,37 +2128,33 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
2133
2128
; AVX512F: # %bb.0:
2134
2129
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0
2135
2130
; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1
2136
- ; AVX512F-NEXT: vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
2137
- ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0
2138
- ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
2139
- ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2140
- ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2141
- ; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
2142
- ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx)
2131
+ ; AVX512F-NEXT: vpternlogd {{.*#+}} ymm0 = ymm1 ^ (mem & (ymm0 ^ ymm1))
2132
+ ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2133
+ ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2134
+ ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
2135
+ ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
2143
2136
; AVX512F-NEXT: vzeroupper
2144
2137
; AVX512F-NEXT: retq
2145
2138
;
2146
2139
; AVX512DQ-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
2147
2140
; AVX512DQ: # %bb.0:
2148
2141
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0
2149
2142
; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1
2150
- ; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
2151
- ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0
2152
- ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
2153
- ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2154
- ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2155
- ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx)
2156
- ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx)
2143
+ ; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm0 = ymm1 ^ (mem & (ymm0 ^ ymm1))
2144
+ ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2145
+ ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2146
+ ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
2147
+ ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
2157
2148
; AVX512DQ-NEXT: vzeroupper
2158
2149
; AVX512DQ-NEXT: retq
2159
2150
;
2160
2151
; AVX512BW-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
2161
2152
; AVX512BW: # %bb.0:
2162
2153
; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm0
2154
+ ; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
2163
2155
; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111
2164
2156
; AVX512BW-NEXT: kmovd %eax, %k1
2165
- ; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm0 {%k1}
2166
- ; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
2157
+ ; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
2167
2158
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2168
2159
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2169
2160
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
@@ -2364,37 +2355,33 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.e
2364
2355
; AVX512F: # %bb.0:
2365
2356
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0
2366
2357
; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1
2367
- ; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
2368
- ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0
2369
- ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
2370
- ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2371
- ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2372
- ; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
2373
- ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx)
2358
+ ; AVX512F-NEXT: vpternlogq {{.*#+}} ymm0 = ymm1 ^ (mem & (ymm0 ^ ymm1))
2359
+ ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2360
+ ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2361
+ ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
2362
+ ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
2374
2363
; AVX512F-NEXT: vzeroupper
2375
2364
; AVX512F-NEXT: retq
2376
2365
;
2377
2366
; AVX512DQ-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
2378
2367
; AVX512DQ: # %bb.0:
2379
2368
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0
2380
2369
; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1
2381
- ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
2382
- ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0
2383
- ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
2384
- ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2385
- ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2386
- ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx)
2387
- ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx)
2370
+ ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm1 ^ (mem & (ymm0 ^ ymm1))
2371
+ ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2372
+ ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2373
+ ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
2374
+ ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
2388
2375
; AVX512DQ-NEXT: vzeroupper
2389
2376
; AVX512DQ-NEXT: retq
2390
2377
;
2391
2378
; AVX512BW-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
2392
2379
; AVX512BW: # %bb.0:
2393
2380
; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm0
2381
+ ; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
2394
2382
; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101
2395
2383
; AVX512BW-NEXT: kmovd %eax, %k1
2396
- ; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm0 {%k1}
2397
- ; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
2384
+ ; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
2398
2385
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2399
2386
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2400
2387
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
@@ -2817,7 +2804,6 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i
2817
2804
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0
2818
2805
; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm1
2819
2806
; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
2820
- ; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm1
2821
2807
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2822
2808
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2823
2809
; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
@@ -2830,7 +2816,6 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i
2830
2816
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0
2831
2817
; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm1
2832
2818
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
2833
- ; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm1
2834
2819
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2835
2820
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2836
2821
; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
@@ -2907,8 +2892,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
2907
2892
; AVX2-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
2908
2893
; AVX2: # %bb.0:
2909
2894
; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0
2910
- ; AVX2-NEXT: vpbroadcastw (%rdi), %xmm1
2911
- ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2],xmm1[3],mem[4,5],xmm1[6],mem[7]
2895
+ ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7]
2912
2896
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
2913
2897
; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2914
2898
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
@@ -2920,26 +2904,24 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
2920
2904
; AVX512F-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
2921
2905
; AVX512F: # %bb.0:
2922
2906
; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm0
2923
- ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7]
2924
- ; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm1
2925
- ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2926
- ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2927
- ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2928
- ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
2929
- ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
2907
+ ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7]
2908
+ ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
2909
+ ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2910
+ ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2911
+ ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx)
2912
+ ; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
2930
2913
; AVX512F-NEXT: vzeroupper
2931
2914
; AVX512F-NEXT: retq
2932
2915
;
2933
2916
; AVX512DQ-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
2934
2917
; AVX512DQ: # %bb.0:
2935
2918
; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm0
2936
- ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7]
2937
- ; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm1
2938
- ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2939
- ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2940
- ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2941
- ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
2942
- ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
2919
+ ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7]
2920
+ ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
2921
+ ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
2922
+ ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2923
+ ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx)
2924
+ ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx)
2943
2925
; AVX512DQ-NEXT: vzeroupper
2944
2926
; AVX512DQ-NEXT: retq
2945
2927
;
@@ -3024,7 +3006,6 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
3024
3006
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0
3025
3007
; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm1
3026
3008
; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
3027
- ; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm1
3028
3009
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
3029
3010
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
3030
3011
; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
@@ -3037,7 +3018,6 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
3037
3018
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0
3038
3019
; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm1
3039
3020
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
3040
- ; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm1
3041
3021
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
3042
3022
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
3043
3023
; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
@@ -3117,8 +3097,7 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
3117
3097
; AVX2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
3118
3098
; AVX2: # %bb.0:
3119
3099
; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0
3120
- ; AVX2-NEXT: vpbroadcastw (%rdi), %xmm1
3121
- ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7]
3100
+ ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7]
3122
3101
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3123
3102
; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
3124
3103
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
@@ -3130,9 +3109,8 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
3130
3109
; AVX512F-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
3131
3110
; AVX512F: # %bb.0:
3132
3111
; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm0
3133
- ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7]
3134
- ; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm1
3135
- ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3112
+ ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7]
3113
+ ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
3136
3114
; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm1
3137
3115
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
3138
3116
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
@@ -3144,9 +3122,8 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
3144
3122
; AVX512DQ-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
3145
3123
; AVX512DQ: # %bb.0:
3146
3124
; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm0
3147
- ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7]
3148
- ; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm1
3149
- ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3125
+ ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7]
3126
+ ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
3150
3127
; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm1
3151
3128
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
3152
3129
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
0 commit comments