Commit 04b403d

[X86] combineConcatVectorOps - only concatenate single-use subops
We could maybe extend this by allowing the lowest subop to have multiple uses and extracting the lowest subvector result of the concatenated op, but let's just get the fix in first. Fixes #67333
1 parent c1ce21b commit 04b403d

6 files changed: +360 -112 lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 1 addition & 1 deletion
@@ -54448,7 +54448,7 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
   // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
   // but it currently struggles with different vector widths.
   if (llvm::all_of(Ops, [Op0](SDValue Op) {
-        return Op.getOpcode() == Op0.getOpcode();
+        return Op.getOpcode() == Op0.getOpcode() && Op.hasOneUse();
       })) {
     auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
       SmallVector<SDValue> Subs;
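The one-line guard above follows the usual one-use idiom for DAG folds: a narrow sub-op is only worth folding into the wider concatenated node when nothing else still consumes its narrow result. As a rough standalone illustration only (the NodeValue struct and canConcatenateSubOps helper are hypothetical names, not the SelectionDAG types used in the code above), the shape of the check is:

#include <vector>

// Hypothetical stand-ins for the SDValue bookkeeping, for illustration only.
struct NodeValue {
  unsigned Opcode;  // operation kind of this candidate sub-op
  unsigned NumUses; // number of nodes that consume this result
};

// Every candidate must share the first sub-op's opcode *and* be single-use,
// so widening them into one concatenated op never leaves another consumer
// still depending on the old narrow node.
static bool canConcatenateSubOps(const std::vector<NodeValue> &Ops) {
  if (Ops.empty())
    return false;
  const NodeValue &Op0 = Ops.front();
  for (const NodeValue &Op : Ops)
    if (Op.Opcode != Op0.Opcode || Op.NumUses != 1)
      return false;
  return true;
}

The commit message sketches a possible follow-up: the lowest sub-op could be allowed to keep extra uses if those uses were rewritten to extract the lowest subvector of the concatenated result, in which case only the first sub-op would be exempt from the single-use requirement.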

llvm/test/CodeGen/X86/pr67333.ll

Lines changed: 190 additions & 0 deletions
@@ -0,0 +1,190 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #0
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #0
+
+define void @SHA256_Compress_Generic(ptr noundef %ctx) #1 {
+; CHECK-LABEL: SHA256_Compress_Generic:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movbel 0, %eax
+; CHECK-NEXT: movbel 12(%rdi), %ecx
+; CHECK-NEXT: vmovd %eax, %xmm0
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,0,1,2,3,128,128,128,128,128,128,128,128]
+; CHECK-NEXT: vpshufb %xmm1, %xmm0, %xmm2
+; CHECK-NEXT: vpsrld $17, %xmm2, %xmm0
+; CHECK-NEXT: vpslld $15, %xmm2, %xmm3
+; CHECK-NEXT: vpor %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vpsrld $19, %xmm2, %xmm3
+; CHECK-NEXT: vpslld $13, %xmm2, %xmm4
+; CHECK-NEXT: vpor %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; CHECK-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vmovd %ecx, %xmm3
+; CHECK-NEXT: vpshufb %xmm1, %xmm3, %xmm1
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm1
+; CHECK-NEXT: vpsrld $17, %xmm1, %xmm0
+; CHECK-NEXT: vpslld $15, %xmm1, %xmm3
+; CHECK-NEXT: vpor %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vpsrld $19, %xmm1, %xmm3
+; CHECK-NEXT: vpslld $13, %xmm1, %xmm4
+; CHECK-NEXT: vpor %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: vpsrld $17, %xmm0, %xmm3
+; CHECK-NEXT: vpslld $15, %xmm0, %xmm4
+; CHECK-NEXT: vpor %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vpsrld $19, %xmm0, %xmm4
+; CHECK-NEXT: vpslld $13, %xmm0, %xmm5
+; CHECK-NEXT: vpor %xmm4, %xmm5, %xmm4
+; CHECK-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; CHECK-NEXT: vpsrld $10, %xmm0, %xmm0
+; CHECK-NEXT: vpxor %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: vpsrld $17, %xmm0, %xmm3
+; CHECK-NEXT: vpslld $15, %xmm0, %xmm4
+; CHECK-NEXT: vpor %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vpsrld $19, %xmm0, %xmm4
+; CHECK-NEXT: vpslld $13, %xmm0, %xmm5
+; CHECK-NEXT: vpor %xmm4, %xmm5, %xmm4
+; CHECK-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; CHECK-NEXT: vpsrld $10, %xmm0, %xmm4
+; CHECK-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,0,2,3]
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; CHECK-NEXT: vpsrld $17, %xmm2, %xmm3
+; CHECK-NEXT: vpslld $15, %xmm2, %xmm4
+; CHECK-NEXT: vpor %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vpsrld $19, %xmm2, %xmm4
+; CHECK-NEXT: vpslld $13, %xmm2, %xmm5
+; CHECK-NEXT: vpor %xmm4, %xmm5, %xmm4
+; CHECK-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; CHECK-NEXT: vpsrld $10, %xmm2, %xmm2
+; CHECK-NEXT: vpxor %xmm2, %xmm3, %xmm2
+; CHECK-NEXT: vpsrlq $32, %xmm1, %xmm3
+; CHECK-NEXT: vpaddd %xmm2, %xmm3, %xmm1
+; CHECK-NEXT: vpsrld $17, %xmm1, %xmm2
+; CHECK-NEXT: vpslld $15, %xmm1, %xmm4
+; CHECK-NEXT: vpor %xmm2, %xmm4, %xmm2
+; CHECK-NEXT: vpsrld $19, %xmm1, %xmm4
+; CHECK-NEXT: vpslld $13, %xmm1, %xmm5
+; CHECK-NEXT: vpor %xmm4, %xmm5, %xmm4
+; CHECK-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; CHECK-NEXT: vpsrld $10, %xmm1, %xmm4
+; CHECK-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; CHECK-NEXT: vpaddd %xmm2, %xmm3, %xmm2
+; CHECK-NEXT: vpsrld $17, %xmm2, %xmm3
+; CHECK-NEXT: vpslld $15, %xmm2, %xmm4
+; CHECK-NEXT: vpor %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vpsrld $19, %xmm2, %xmm4
+; CHECK-NEXT: vpslld $13, %xmm2, %xmm5
+; CHECK-NEXT: vpor %xmm4, %xmm5, %xmm4
+; CHECK-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; CHECK-NEXT: vpsrld $10, %xmm2, %xmm2
+; CHECK-NEXT: vpxor %xmm2, %xmm3, %xmm2
+; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vpsrld $17, %xmm0, %xmm2
+; CHECK-NEXT: vpslld $15, %xmm0, %xmm3
+; CHECK-NEXT: vpor %xmm2, %xmm3, %xmm2
+; CHECK-NEXT: vpsrld $19, %xmm0, %xmm3
+; CHECK-NEXT: vpslld $13, %xmm0, %xmm4
+; CHECK-NEXT: vpor %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; CHECK-NEXT: vpsrld $10, %xmm0, %xmm3
+; CHECK-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; CHECK-NEXT: vpsllq $32, %xmm1, %xmm3
+; CHECK-NEXT: vpaddd %xmm2, %xmm3, %xmm2
+; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; CHECK-NEXT: vmovdqu %ymm0, 132(%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+  %0 = load i32, ptr null, align 4
+  %1 = tail call i32 asm "bswap $0", "=r,0,~{dirflag},~{fpsr},~{flags}"(i32 %0) #3
+  %arrayidx14 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 3
+  %2 = load i32, ptr %arrayidx14, align 4
+  %3 = tail call i32 asm "bswap $0", "=r,0,~{dirflag},~{fpsr},~{flags}"(i32 %2) #3
+  %4 = insertelement <2 x i32> zeroinitializer, i32 %1, i64 1
+  %5 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %4, <2 x i32> %4, <2 x i32> <i32 15, i32 15>)
+  %6 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %4, <2 x i32> %4, <2 x i32> <i32 13, i32 13>)
+  %7 = xor <2 x i32> %5, %6
+  %8 = lshr <2 x i32> %4, zeroinitializer
+  %9 = xor <2 x i32> %7, %8
+  %10 = insertelement <2 x i32> zeroinitializer, i32 %3, i64 0
+  %11 = shufflevector <2 x i32> zeroinitializer, <2 x i32> %10, <2 x i32> <i32 1, i32 2>
+  %12 = add <2 x i32> %11, %9
+  %13 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %12, <2 x i32> %12, <2 x i32> <i32 15, i32 15>)
+  %14 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %12, <2 x i32> %12, <2 x i32> <i32 13, i32 13>)
+  %15 = xor <2 x i32> %13, %14
+  %16 = lshr <2 x i32> %12, zeroinitializer
+  %17 = xor <2 x i32> %15, %16
+  %18 = add <2 x i32> %4, %17
+  %19 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %18, <2 x i32> %18, <2 x i32> <i32 15, i32 15>)
+  %20 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %18, <2 x i32> %18, <2 x i32> <i32 13, i32 13>)
+  %21 = xor <2 x i32> %19, %20
+  %22 = lshr <2 x i32> %18, <i32 10, i32 10>
+  %23 = xor <2 x i32> %21, %22
+  %24 = add <2 x i32> %4, %23
+  %25 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %24, <2 x i32> %24, <2 x i32> <i32 15, i32 15>)
+  %26 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %24, <2 x i32> %24, <2 x i32> <i32 13, i32 13>)
+  %27 = xor <2 x i32> %25, %26
+  %28 = lshr <2 x i32> %24, <i32 10, i32 10>
+  %29 = xor <2 x i32> %27, %28
+  %30 = shufflevector <2 x i32> %4, <2 x i32> %12, <2 x i32> <i32 1, i32 2>
+  %31 = add <2 x i32> %30, %29
+  %32 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %31, <2 x i32> %31, <2 x i32> <i32 15, i32 15>)
+  %33 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %31, <2 x i32> %31, <2 x i32> <i32 13, i32 13>)
+  %34 = xor <2 x i32> %32, %33
+  %35 = lshr <2 x i32> %31, <i32 10, i32 10>
+  %36 = xor <2 x i32> %34, %35
+  %37 = shufflevector <2 x i32> %12, <2 x i32> zeroinitializer, <2 x i32> <i32 1, i32 2>
+  %38 = add <2 x i32> %37, %36
+  %arrayidx918 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 33
+  store <2 x i32> %38, ptr %arrayidx918, align 4
+  %arrayidx1012 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 35
+  %39 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %38, <2 x i32> %38, <2 x i32> <i32 15, i32 15>)
+  %40 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %38, <2 x i32> %38, <2 x i32> <i32 13, i32 13>)
+  %41 = xor <2 x i32> %39, %40
+  %42 = lshr <2 x i32> %38, <i32 10, i32 10>
+  %43 = xor <2 x i32> %41, %42
+  %44 = add <2 x i32> %37, %43
+  store <2 x i32> zeroinitializer, ptr %arrayidx1012, align 4
+  %arrayidx1106 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 37
+  %45 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %44, <2 x i32> %44, <2 x i32> <i32 15, i32 15>)
+  %46 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %44, <2 x i32> %44, <2 x i32> <i32 13, i32 13>)
+  %47 = xor <2 x i32> %45, %46
+  %48 = lshr <2 x i32> %44, <i32 10, i32 10>
+  %49 = xor <2 x i32> %47, %48
+  %50 = lshr <2 x i32> %24, zeroinitializer
+  %51 = add <2 x i32> %50, %49
+  store <2 x i32> %51, ptr %arrayidx1106, align 4
+  %arrayidx1200 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 39
+  %52 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %51, <2 x i32> %51, <2 x i32> <i32 15, i32 15>)
+  %53 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %51, <2 x i32> %51, <2 x i32> <i32 13, i32 13>)
+  %54 = xor <2 x i32> %52, %53
+  %55 = lshr <2 x i32> %51, <i32 10, i32 10>
+  %56 = xor <2 x i32> %54, %55
+  %57 = shufflevector <2 x i32> %38, <2 x i32> zeroinitializer, <2 x i32> <i32 poison, i32 0>
+  %58 = insertelement <2 x i32> %57, i32 0, i64 0
+  %59 = add <2 x i32> %58, %56
+  store <2 x i32> %59, ptr %arrayidx1200, align 4
+  ret void
+
+  ; uselistorder directives
+  uselistorder <2 x i32> %4, { 7, 0, 1, 6, 5, 4, 3, 2 }
+  uselistorder <2 x i32> %38, { 6, 5, 4, 3, 2, 1, 0 }
+}
+
+declare <2 x i32> @llvm.fshl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) #2
+
+; uselistorder directives
+uselistorder ptr @llvm.fshl.v2i32, { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }
+
+attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+attributes #1 = { nounwind sspstrong memory(argmem: readwrite) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "probe-stack"="inline-asm" "stack-protector-buffer-size"="8" "target-cpu"="skylake" "target-features"="+adx,+aes,+avx,+avx2,+bmi,+bmi2,+clflushopt,+cmov,+crc32,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" }
+attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #3 = { nounwind memory(none) }

llvm/test/CodeGen/X86/subvector-broadcast.ll

Lines changed: 4 additions & 4 deletions
@@ -1756,8 +1756,8 @@ define void @PR51226() {
 ; X86-AVX2-LABEL: PR51226:
 ; X86-AVX2: # %bb.0:
 ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX2-NEXT: vpslld $16, %xmm0, %xmm0
 ; X86-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpslld $16, %ymm0, %ymm0
 ; X86-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; X86-AVX2-NEXT: vminps %ymm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT: vmovups %ymm0, (%eax)
@@ -1767,8 +1767,8 @@ define void @PR51226() {
 ; X86-AVX512-LABEL: PR51226:
 ; X86-AVX512: # %bb.0:
 ; X86-AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX512-NEXT: vpslld $16, %xmm0, %xmm0
 ; X86-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X86-AVX512-NEXT: vpslld $16, %ymm0, %ymm0
 ; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; X86-AVX512-NEXT: vminps %ymm1, %ymm0, %ymm0
 ; X86-AVX512-NEXT: vmovups %ymm0, (%eax)
@@ -1789,8 +1789,8 @@ define void @PR51226() {
 ; X64-AVX2-LABEL: PR51226:
 ; X64-AVX2: # %bb.0:
 ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX2-NEXT: vpslld $16, %xmm0, %xmm0
 ; X64-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpslld $16, %ymm0, %ymm0
 ; X64-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; X64-AVX2-NEXT: vminps %ymm1, %ymm0, %ymm0
 ; X64-AVX2-NEXT: vmovups %ymm0, (%rax)
@@ -1800,8 +1800,8 @@ define void @PR51226() {
 ; X64-AVX512-LABEL: PR51226:
 ; X64-AVX512: # %bb.0:
 ; X64-AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX512-NEXT: vpslld $16, %xmm0, %xmm0
 ; X64-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpslld $16, %ymm0, %ymm0
 ; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; X64-AVX512-NEXT: vminps %ymm1, %ymm0, %ymm0
 ; X64-AVX512-NEXT: vmovups %ymm0, (%rax)

llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll

Lines changed: 9 additions & 7 deletions
@@ -725,17 +725,19 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512BW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm0[0,2,1,3,4,6,5,7]
+; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3]
+; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zmm1[18],zero,zero,zero,zero,zero,zero,zmm1[19],zero,zero,zero,zero,zmm1[36,44],zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46,54],zero,zero,zero,zero,zero,zero,zmm1[55],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,0,2,5,7,4,6]
+; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2]
+; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zmm0[1],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zmm0[36],zero,zero,zero,zero,zero,zero,zmm0[37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-SLOW-NEXT: vporq %zmm1, %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm1
-; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm2 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5]
-; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-SLOW-NEXT: vporq %zmm2, %zmm1, %zmm1
+; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
+; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
+; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zero,zmm2[20,28],zero,zero,zero,zero,zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zero,zmm2[38,46],zero,zero,zero,zmm2[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-SLOW-NEXT: vporq %zmm1, %zmm2, %zmm1
 ; AVX512BW-SLOW-NEXT: movabsq $63546854584629360, %rcx # imm = 0xE1C3870E1C3870
 ; AVX512BW-SLOW-NEXT: kmovq %rcx, %k1
 ; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
