
Commit 3538bd0

git apple-llvm automerger committed:
Merge commit '37b79e779f44' from llvm.org/release/17.x into stable/20230725

2 parents: d7305d9 + 37b79e7

File tree: 6 files changed (+358, -164 lines)

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 1 addition & 1 deletion
@@ -57239,7 +57239,7 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
   // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
   // but it currently struggles with different vector widths.
   if (llvm::all_of(Ops, [Op0](SDValue Op) {
-        return Op.getOpcode() == Op0.getOpcode();
+        return Op.getOpcode() == Op0.getOpcode() && Op.hasOneUse();
       })) {
     auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
       SmallVector<SDValue> Subs;
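Context for the fix, hedged: combineConcatVectorOps folds a set of per-subvector operations into one wide operation, and the added Op.hasOneUse() check makes the fold fire only when each narrow op has no other users, so the original narrow nodes are not left live alongside the concatenated one. A minimal stand-alone C++ sketch of the predicate's shape (Node and canConcatOps are invented for this illustration; they are not LLVM's SelectionDAG API):

#include <algorithm>
#include <vector>

struct Node {
  unsigned Opcode = 0;
  unsigned NumUses = 0;
  unsigned getOpcode() const { return Opcode; }
  bool hasOneUse() const { return NumUses == 1; }
};

// Mirrors the fixed predicate: every op matches Ops[0]'s opcode *and*
// has a single user, so concatenation cannot strand multi-use nodes.
bool canConcatOps(const std::vector<Node *> &Ops) {
  const Node *Op0 = Ops.front();
  return std::all_of(Ops.begin(), Ops.end(), [Op0](const Node *Op) {
    return Op->getOpcode() == Op0->getOpcode() && Op->hasOneUse();
  });
}

The new pr67333.ll file below is the regression test that exercises this path.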

llvm/test/CodeGen/X86/pr67333.ll

Lines changed: 190 additions & 0 deletions
@@ -0,0 +1,190 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #0
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #0
+
+define void @SHA256_Compress_Generic(ptr noundef %ctx) #1 {
+; CHECK-LABEL: SHA256_Compress_Generic:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movbel 0, %eax
+; CHECK-NEXT:    movbel 12(%rdi), %ecx
+; CHECK-NEXT:    vmovd %eax, %xmm0
+; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,0,1,2,3,128,128,128,128,128,128,128,128]
+; CHECK-NEXT:    vpshufb %xmm1, %xmm0, %xmm2
+; CHECK-NEXT:    vpsrld $17, %xmm2, %xmm0
+; CHECK-NEXT:    vpslld $15, %xmm2, %xmm3
+; CHECK-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; CHECK-NEXT:    vpsrld $19, %xmm2, %xmm3
+; CHECK-NEXT:    vpslld $13, %xmm2, %xmm4
+; CHECK-NEXT:    vpor %xmm3, %xmm4, %xmm3
+; CHECK-NEXT:    vpxor %xmm3, %xmm0, %xmm0
+; CHECK-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vmovd %ecx, %xmm3
+; CHECK-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
+; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
+; CHECK-NEXT:    vpsrld $17, %xmm1, %xmm0
+; CHECK-NEXT:    vpslld $15, %xmm1, %xmm3
+; CHECK-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; CHECK-NEXT:    vpsrld $19, %xmm1, %xmm3
+; CHECK-NEXT:    vpslld $13, %xmm1, %xmm4
+; CHECK-NEXT:    vpor %xmm3, %xmm4, %xmm3
+; CHECK-NEXT:    vpxor %xmm3, %xmm0, %xmm0
+; CHECK-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
+; CHECK-NEXT:    vpsrld $17, %xmm0, %xmm3
+; CHECK-NEXT:    vpslld $15, %xmm0, %xmm4
+; CHECK-NEXT:    vpor %xmm3, %xmm4, %xmm3
+; CHECK-NEXT:    vpsrld $19, %xmm0, %xmm4
+; CHECK-NEXT:    vpslld $13, %xmm0, %xmm5
+; CHECK-NEXT:    vpor %xmm4, %xmm5, %xmm4
+; CHECK-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; CHECK-NEXT:    vpsrld $10, %xmm0, %xmm0
+; CHECK-NEXT:    vpxor %xmm0, %xmm3, %xmm0
+; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
+; CHECK-NEXT:    vpsrld $17, %xmm0, %xmm3
+; CHECK-NEXT:    vpslld $15, %xmm0, %xmm4
+; CHECK-NEXT:    vpor %xmm3, %xmm4, %xmm3
+; CHECK-NEXT:    vpsrld $19, %xmm0, %xmm4
+; CHECK-NEXT:    vpslld $13, %xmm0, %xmm5
+; CHECK-NEXT:    vpor %xmm4, %xmm5, %xmm4
+; CHECK-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; CHECK-NEXT:    vpsrld $10, %xmm0, %xmm4
+; CHECK-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3]
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,0,2,3]
+; CHECK-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
+; CHECK-NEXT:    vpsrld $17, %xmm2, %xmm3
+; CHECK-NEXT:    vpslld $15, %xmm2, %xmm4
+; CHECK-NEXT:    vpor %xmm3, %xmm4, %xmm3
+; CHECK-NEXT:    vpsrld $19, %xmm2, %xmm4
+; CHECK-NEXT:    vpslld $13, %xmm2, %xmm5
+; CHECK-NEXT:    vpor %xmm4, %xmm5, %xmm4
+; CHECK-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; CHECK-NEXT:    vpsrld $10, %xmm2, %xmm2
+; CHECK-NEXT:    vpxor %xmm2, %xmm3, %xmm2
+; CHECK-NEXT:    vpsrlq $32, %xmm1, %xmm3
+; CHECK-NEXT:    vpaddd %xmm2, %xmm3, %xmm1
+; CHECK-NEXT:    vpsrld $17, %xmm1, %xmm2
+; CHECK-NEXT:    vpslld $15, %xmm1, %xmm4
+; CHECK-NEXT:    vpor %xmm2, %xmm4, %xmm2
+; CHECK-NEXT:    vpsrld $19, %xmm1, %xmm4
+; CHECK-NEXT:    vpslld $13, %xmm1, %xmm5
+; CHECK-NEXT:    vpor %xmm4, %xmm5, %xmm4
+; CHECK-NEXT:    vpxor %xmm4, %xmm2, %xmm2
+; CHECK-NEXT:    vpsrld $10, %xmm1, %xmm4
+; CHECK-NEXT:    vpxor %xmm4, %xmm2, %xmm2
+; CHECK-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
+; CHECK-NEXT:    vpsrld $17, %xmm2, %xmm3
+; CHECK-NEXT:    vpslld $15, %xmm2, %xmm4
+; CHECK-NEXT:    vpor %xmm3, %xmm4, %xmm3
+; CHECK-NEXT:    vpsrld $19, %xmm2, %xmm4
+; CHECK-NEXT:    vpslld $13, %xmm2, %xmm5
+; CHECK-NEXT:    vpor %xmm4, %xmm5, %xmm4
+; CHECK-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; CHECK-NEXT:    vpsrld $10, %xmm2, %xmm2
+; CHECK-NEXT:    vpxor %xmm2, %xmm3, %xmm2
+; CHECK-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vpsrld $17, %xmm0, %xmm2
+; CHECK-NEXT:    vpslld $15, %xmm0, %xmm3
+; CHECK-NEXT:    vpor %xmm2, %xmm3, %xmm2
+; CHECK-NEXT:    vpsrld $19, %xmm0, %xmm3
+; CHECK-NEXT:    vpslld $13, %xmm0, %xmm4
+; CHECK-NEXT:    vpor %xmm3, %xmm4, %xmm3
+; CHECK-NEXT:    vpxor %xmm3, %xmm2, %xmm2
+; CHECK-NEXT:    vpsrld $10, %xmm0, %xmm3
+; CHECK-NEXT:    vpxor %xmm3, %xmm2, %xmm2
+; CHECK-NEXT:    vpsllq $32, %xmm1, %xmm3
+; CHECK-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
+; CHECK-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; CHECK-NEXT:    vmovdqu %ymm0, 132(%rdi)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+entry:
+  %0 = load i32, ptr null, align 4
+  %1 = tail call i32 asm "bswap $0", "=r,0,~{dirflag},~{fpsr},~{flags}"(i32 %0) #3
+  %arrayidx14 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 3
+  %2 = load i32, ptr %arrayidx14, align 4
+  %3 = tail call i32 asm "bswap $0", "=r,0,~{dirflag},~{fpsr},~{flags}"(i32 %2) #3
+  %4 = insertelement <2 x i32> zeroinitializer, i32 %1, i64 1
+  %5 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %4, <2 x i32> %4, <2 x i32> <i32 15, i32 15>)
+  %6 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %4, <2 x i32> %4, <2 x i32> <i32 13, i32 13>)
+  %7 = xor <2 x i32> %5, %6
+  %8 = lshr <2 x i32> %4, zeroinitializer
+  %9 = xor <2 x i32> %7, %8
+  %10 = insertelement <2 x i32> zeroinitializer, i32 %3, i64 0
+  %11 = shufflevector <2 x i32> zeroinitializer, <2 x i32> %10, <2 x i32> <i32 1, i32 2>
+  %12 = add <2 x i32> %11, %9
+  %13 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %12, <2 x i32> %12, <2 x i32> <i32 15, i32 15>)
+  %14 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %12, <2 x i32> %12, <2 x i32> <i32 13, i32 13>)
+  %15 = xor <2 x i32> %13, %14
+  %16 = lshr <2 x i32> %12, zeroinitializer
+  %17 = xor <2 x i32> %15, %16
+  %18 = add <2 x i32> %4, %17
+  %19 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %18, <2 x i32> %18, <2 x i32> <i32 15, i32 15>)
+  %20 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %18, <2 x i32> %18, <2 x i32> <i32 13, i32 13>)
+  %21 = xor <2 x i32> %19, %20
+  %22 = lshr <2 x i32> %18, <i32 10, i32 10>
+  %23 = xor <2 x i32> %21, %22
+  %24 = add <2 x i32> %4, %23
+  %25 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %24, <2 x i32> %24, <2 x i32> <i32 15, i32 15>)
+  %26 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %24, <2 x i32> %24, <2 x i32> <i32 13, i32 13>)
+  %27 = xor <2 x i32> %25, %26
+  %28 = lshr <2 x i32> %24, <i32 10, i32 10>
+  %29 = xor <2 x i32> %27, %28
+  %30 = shufflevector <2 x i32> %4, <2 x i32> %12, <2 x i32> <i32 1, i32 2>
+  %31 = add <2 x i32> %30, %29
+  %32 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %31, <2 x i32> %31, <2 x i32> <i32 15, i32 15>)
+  %33 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %31, <2 x i32> %31, <2 x i32> <i32 13, i32 13>)
+  %34 = xor <2 x i32> %32, %33
+  %35 = lshr <2 x i32> %31, <i32 10, i32 10>
+  %36 = xor <2 x i32> %34, %35
+  %37 = shufflevector <2 x i32> %12, <2 x i32> zeroinitializer, <2 x i32> <i32 1, i32 2>
+  %38 = add <2 x i32> %37, %36
+  %arrayidx918 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 33
+  store <2 x i32> %38, ptr %arrayidx918, align 4
+  %arrayidx1012 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 35
+  %39 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %38, <2 x i32> %38, <2 x i32> <i32 15, i32 15>)
+  %40 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %38, <2 x i32> %38, <2 x i32> <i32 13, i32 13>)
+  %41 = xor <2 x i32> %39, %40
+  %42 = lshr <2 x i32> %38, <i32 10, i32 10>
+  %43 = xor <2 x i32> %41, %42
+  %44 = add <2 x i32> %37, %43
+  store <2 x i32> zeroinitializer, ptr %arrayidx1012, align 4
+  %arrayidx1106 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 37
+  %45 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %44, <2 x i32> %44, <2 x i32> <i32 15, i32 15>)
+  %46 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %44, <2 x i32> %44, <2 x i32> <i32 13, i32 13>)
+  %47 = xor <2 x i32> %45, %46
+  %48 = lshr <2 x i32> %44, <i32 10, i32 10>
+  %49 = xor <2 x i32> %47, %48
+  %50 = lshr <2 x i32> %24, zeroinitializer
+  %51 = add <2 x i32> %50, %49
+  store <2 x i32> %51, ptr %arrayidx1106, align 4
+  %arrayidx1200 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 39
+  %52 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %51, <2 x i32> %51, <2 x i32> <i32 15, i32 15>)
+  %53 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %51, <2 x i32> %51, <2 x i32> <i32 13, i32 13>)
+  %54 = xor <2 x i32> %52, %53
+  %55 = lshr <2 x i32> %51, <i32 10, i32 10>
+  %56 = xor <2 x i32> %54, %55
+  %57 = shufflevector <2 x i32> %38, <2 x i32> zeroinitializer, <2 x i32> <i32 poison, i32 0>
+  %58 = insertelement <2 x i32> %57, i32 0, i64 0
+  %59 = add <2 x i32> %58, %56
+  store <2 x i32> %59, ptr %arrayidx1200, align 4
+  ret void
+
+; uselistorder directives
+  uselistorder <2 x i32> %4, { 7, 0, 1, 6, 5, 4, 3, 2 }
+  uselistorder <2 x i32> %38, { 6, 5, 4, 3, 2, 1, 0 }
+}
+
+declare <2 x i32> @llvm.fshl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) #2
+
+; uselistorder directives
+uselistorder ptr @llvm.fshl.v2i32, { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }
+
+attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+attributes #1 = { nounwind sspstrong memory(argmem: readwrite) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "probe-stack"="inline-asm" "stack-protector-buffer-size"="8" "target-cpu"="skylake" "target-features"="+adx,+aes,+avx,+avx2,+bmi,+bmi2,+clflushopt,+cmov,+crc32,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" }
+attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #3 = { nounwind memory(none) }
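A hedged reading of the test body: each <2 x i32> lane runs the same rotate/xor/shift network, and fshl(x, x, 15) is a rotate left by 15, which equals a rotate right by 17 on 32-bit values (likewise 13 and 19). Together with the lshr by 10 this is the SHA-256 message-schedule function usually written sigma1, matching the function name. A scalar C++ reference of that identity (the helper names are mine, not from the test):

#include <cassert>
#include <cstdint>

// Rotate right; valid for 0 < N < 32, which covers the constants used here.
static uint32_t rotr32(uint32_t X, unsigned N) {
  return (X >> N) | (X << (32 - N));
}

// Rotate left, the direct analogue of @llvm.fshl with both data operands equal.
static uint32_t rotl32(uint32_t X, unsigned N) {
  return (X << N) | (X >> (32 - N));
}

// SHA-256 small sigma-1: rotr(x,17) ^ rotr(x,19) ^ (x >> 10).
static uint32_t sigma1(uint32_t X) {
  return rotr32(X, 17) ^ rotr32(X, 19) ^ (X >> 10);
}

int main() {
  // fshl(x, x, 15) == rotl(x, 15) == rotr(x, 17); same for 13 vs 19.
  for (uint32_t X : {0x6a09e667u, 0xbb67ae85u, 0x3c6ef372u})
    assert((rotl32(X, 15) ^ rotl32(X, 13) ^ (X >> 10)) == sigma1(X));
}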

llvm/test/CodeGen/X86/subvector-broadcast.ll

Lines changed: 4 additions & 4 deletions
@@ -1768,8 +1768,8 @@ define void @PR51226() {
 ; X86-AVX2-LABEL: PR51226:
 ; X86-AVX2:       # %bb.0:
 ; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX2-NEXT:    vpslld $16, %xmm0, %xmm0
 ; X86-AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpslld $16, %ymm0, %ymm0
 ; X86-AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-AVX2-NEXT:    vminps %ymm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT:    vmovups %ymm0, (%eax)
@@ -1779,8 +1779,8 @@ define void @PR51226() {
 ; X86-AVX512-LABEL: PR51226:
 ; X86-AVX512:       # %bb.0:
 ; X86-AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX512-NEXT:    vpslld $16, %xmm0, %xmm0
 ; X86-AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X86-AVX512-NEXT:    vpslld $16, %ymm0, %ymm0
 ; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-AVX512-NEXT:    vminps %ymm1, %ymm0, %ymm0
 ; X86-AVX512-NEXT:    vmovups %ymm0, (%eax)
@@ -1801,8 +1801,8 @@ define void @PR51226() {
 ; X64-AVX2-LABEL: PR51226:
 ; X64-AVX2:       # %bb.0:
 ; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX2-NEXT:    vpslld $16, %xmm0, %xmm0
 ; X64-AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpslld $16, %ymm0, %ymm0
 ; X64-AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X64-AVX2-NEXT:    vminps %ymm1, %ymm0, %ymm0
 ; X64-AVX2-NEXT:    vmovups %ymm0, (%rax)
@@ -1812,8 +1812,8 @@ define void @PR51226() {
 ; X64-AVX512-LABEL: PR51226:
 ; X64-AVX512:       # %bb.0:
 ; X64-AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX512-NEXT:    vpslld $16, %xmm0, %xmm0
 ; X64-AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpslld $16, %ymm0, %ymm0
 ; X64-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X64-AVX512-NEXT:    vminps %ymm1, %ymm0, %ymm0
 ; X64-AVX512-NEXT:    vmovups %ymm0, (%rax)
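Hedged note on the PR51226 updates above: the vpslld now runs on the 128-bit half before vinserti128 duplicates it into both lanes, which is sound because a per-lane shift commutes with lane duplication. A small C++ model of that equivalence (the array types and helpers are invented for this sketch):

#include <array>
#include <cassert>
#include <cstdint>

using V4 = std::array<uint32_t, 4>;
using V8 = std::array<uint32_t, 8>;

// Per-lane shift on the 128-bit half (the new vpslld $16, %xmm0, %xmm0).
static V4 shl4(V4 X, unsigned N) { for (auto &L : X) L <<= N; return X; }

// Per-lane shift on the full 256-bit vector (the old vpslld $16, %ymm0, %ymm0).
static V8 shl8(V8 X, unsigned N) { for (auto &L : X) L <<= N; return X; }

// Duplicate the half into both 128-bit lanes (vinserti128 $1, x, ymm, ymm).
static V8 dup(const V4 &X) {
  return {X[0], X[1], X[2], X[3], X[0], X[1], X[2], X[3]};
}

int main() {
  V4 X{0x1111, 0x2222, 0x3333, 0x4444};
  // Shifting before the broadcast gives the same lanes as shifting after it.
  assert(dup(shl4(X, 16)) == shl8(dup(X), 16));
}

The stride-7 diff below applies the same idea to permutes: the lane shuffle is done on ymm first, then broadcast to zmm with vinserti64x4.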

llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll

Lines changed: 9 additions & 7 deletions
@@ -726,17 +726,19 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX512BW-SLOW-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512BW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[0,2,1,3,4,6,5,7]
+; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3]
+; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm1, %zmm1, %zmm1
 ; AVX512BW-SLOW-NEXT:    vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zmm1[18],zero,zero,zero,zero,zero,zero,zmm1[19],zero,zero,zero,zero,zmm1[36,44],zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46,54],zero,zero,zero,zero,zero,zero,zmm1[55],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[1,3,0,2,5,7,4,6]
+; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2]
+; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT:    vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zmm0[1],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zmm0[36],zero,zero,zero,zero,zero,zero,zmm0[37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-SLOW-NEXT:    vporq %zmm1, %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm1
-; AVX512BW-SLOW-NEXT:    vpshufb {{.*#+}} zmm2 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5]
-; AVX512BW-SLOW-NEXT:    vpshufb {{.*#+}} zmm1 = zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-SLOW-NEXT:    vporq %zmm2, %zmm1, %zmm1
+; AVX512BW-SLOW-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
+; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm2
+; AVX512BW-SLOW-NEXT:    vpshufb {{.*#+}} zmm2 = zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zero,zmm2[20,28],zero,zero,zero,zero,zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zero,zmm2[38,46],zero,zero,zero,zmm2[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-SLOW-NEXT:    vporq %zmm1, %zmm2, %zmm1
 ; AVX512BW-SLOW-NEXT:    movabsq $63546854584629360, %rcx # imm = 0xE1C3870E1C3870
 ; AVX512BW-SLOW-NEXT:    kmovq %rcx, %k1
 ; AVX512BW-SLOW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
