Commit 6bc3c9e

[X86] combineX86ShuffleChain - always create VPERMV3 nodes if started from a VPERMV3 node
If the root shuffle node was a VPERMV3 node, then we can always replace it with a new VPERMV3 node: we are swapping one variable shuffle for another, so it doesn't matter whether other variable shuffles in the chain had multiple uses.
1 parent 140680c commit 6bc3c9e

5 files changed: +226 -224 lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 4 additions & 0 deletions
@@ -39835,6 +39835,10 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
   bool AllowBWIVPERMV3 =
       (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
 
+  // If root was a VPERMV3 node, always allow a variable shuffle.
+  if (Root.getOpcode() == X86ISD::VPERMV3)
+    AllowVariableCrossLaneMask = AllowVariablePerLaneMask = true;
+
   bool MaskContainsZeros = isAnyZero(Mask);
 
   if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
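
The new check short-circuits the depth heuristic. Below is a minimal standalone sketch of the combined gating idea (a hypothetical helper with illustrative names like MinDepth; the real logic lives inline in combineX86ShuffleChain): a variable-mask shuffle normally pays for an extra constant mask load, so it is only created deep in a chain or when a variable mask is already present, but when the root is itself a VPERMV3 the combine merely swaps one variable shuffle for another.

enum RootKind { VPERMV3Root, OtherRoot }; // stand-in for X86ISD opcodes

// Hypothetical condensation of the gating, under the assumption that
// cost only matters when a *new* variable-mask shuffle is introduced.
static bool allowVariableMask(RootKind Root, unsigned Depth,
                              unsigned MinDepth, bool HasVariableMask) {
  // New in this commit: replacing a VPERMV3 root with another VPERMV3
  // cannot increase the number of variable shuffles, so always allow.
  if (Root == VPERMV3Root)
    return true;
  // Pre-existing heuristic: require a deep enough chain, or a variable
  // mask already in the chain, to amortize the mask load.
  return Depth >= MinDepth || HasVariableMask;
}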

llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll

Lines changed: 40 additions & 40 deletions
@@ -123,12 +123,12 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,9]
+; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9]
+; AVX512-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5]
 ; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
-; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,1,3,5]
-; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm3
-; AVX512-NEXT: vmovdqa64 %zmm3, (%r9)
-; AVX512-NEXT: vmovdqa %xmm1, 64(%r9)
+; AVX512-NEXT: vmovdqa %xmm3, 64(%r9)
+; AVX512-NEXT: vmovdqa64 %zmm1, (%r9)
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
 ;
@@ -140,12 +140,12 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,9]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9]
+; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5]
 ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,1,3,5]
-; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3
-; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%r9)
-; AVX512-FCP-NEXT: vmovdqa %xmm1, 64(%r9)
+; AVX512-FCP-NEXT: vmovdqa %xmm3, 64(%r9)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%r9)
 ; AVX512-FCP-NEXT: vzeroupper
 ; AVX512-FCP-NEXT: retq
 ;
@@ -157,12 +157,12 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,9]
+; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9]
+; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5]
 ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
-; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,1,3,5]
-; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm3
-; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%r9)
-; AVX512DQ-NEXT: vmovdqa %xmm1, 64(%r9)
+; AVX512DQ-NEXT: vmovdqa %xmm3, 64(%r9)
+; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%r9)
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;
@@ -174,12 +174,12 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,9]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5]
 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,1,3,5]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%r9)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 64(%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, 64(%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%r9)
 ; AVX512DQ-FCP-NEXT: vzeroupper
 ; AVX512DQ-FCP-NEXT: retq
 ;
@@ -191,12 +191,12 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,9]
+; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9]
+; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5]
 ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
-; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,1,3,5]
-; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm3
-; AVX512BW-NEXT: vmovdqa64 %zmm3, (%r9)
-; AVX512BW-NEXT: vmovdqa %xmm1, 64(%r9)
+; AVX512BW-NEXT: vmovdqa %xmm3, 64(%r9)
+; AVX512BW-NEXT: vmovdqa64 %zmm1, (%r9)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 ;
@@ -208,12 +208,12 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,9]
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5]
 ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,1,3,5]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%r9)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm1, 64(%r9)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm3, 64(%r9)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%r9)
 ; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
 ;
@@ -225,12 +225,12 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,9]
+; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9]
+; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5]
 ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
-; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,1,3,5]
-; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm3
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%r9)
-; AVX512DQ-BW-NEXT: vmovdqa %xmm1, 64(%r9)
+; AVX512DQ-BW-NEXT: vmovdqa %xmm3, 64(%r9)
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%r9)
 ; AVX512DQ-BW-NEXT: vzeroupper
 ; AVX512DQ-BW-NEXT: retq
 ;
@@ -242,12 +242,12 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,9]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5]
 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,1,3,5]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%r9)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, 64(%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, 64(%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%r9)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
 %in.vec0 = load <2 x i64>, ptr %in.vecptr0, align 64
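
The mask changes above ([7,9] on %zmm0 becoming [3,9] on %zmm1) are easier to audit with a scalar model of vpermi2q. This is a hedged sketch of the documented semantics for the 8 x i64 form (the index operand doubles as the destination; indices 0-7 select from the first table operand, 8-15 from the second):

#include <array>
#include <cstdint>

using V8 = std::array<uint64_t, 8>;

// Scalar model of VPERMI2Q (8 x i64): each index uses 4 bits,
// 3 to pick an element and 1 to pick between the two tables.
V8 vpermi2q(const V8 &Idx, const V8 &TblA, const V8 &TblB) {
  V8 Out{};
  for (int I = 0; I != 8; ++I) {
    uint64_t Sel = Idx[I] & 15;
    Out[I] = Sel < 8 ? TblA[Sel] : TblB[Sel - 8];
  }
  return Out;
}

Since %zmm0 was built by inserting %ymm1 into its upper half, element 7 of %zmm0 is element 3 of %ymm1; so, assuming the register assignments visible in the checks above, indexing [3,9] into (%zmm1, %zmm2) yields the same qwords as the old [7,9] into (%zmm0, %zmm2), and the xmm result no longer depends on the full 512-bit concatenation.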

llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll

Lines changed: 56 additions & 56 deletions
@@ -138,13 +138,13 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1
-; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm2 = [5,7,9,11]
-; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
-; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
-; AVX512-NEXT: vmovdqa64 %zmm3, (%rax)
-; AVX512-NEXT: vmovdqa %ymm2, 64(%rax)
+; AVX512-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
+; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11]
+; AVX512-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3]
+; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
+; AVX512-NEXT: vmovdqa %ymm3, 64(%rax)
+; AVX512-NEXT: vmovdqa64 %zmm1, (%rax)
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
 ;
@@ -157,13 +157,13 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [5,7,9,11]
-; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
-; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
-; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
-; AVX512-FCP-NEXT: vmovdqa %ymm2, 64(%rax)
+; AVX512-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11]
+; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3]
+; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
+; AVX512-FCP-NEXT: vmovdqa %ymm3, 64(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
 ; AVX512-FCP-NEXT: vzeroupper
 ; AVX512-FCP-NEXT: retq
 ;
@@ -176,13 +176,13 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1
-; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm2 = [5,7,9,11]
-; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
-; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
-; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rax)
-; AVX512DQ-NEXT: vmovdqa %ymm2, 64(%rax)
+; AVX512DQ-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
+; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11]
+; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3]
+; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
+; AVX512DQ-NEXT: vmovdqa %ymm3, 64(%rax)
+; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rax)
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;
@@ -195,13 +195,13 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [5,7,9,11]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, 64(%rax)
+; AVX512DQ-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, 64(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
 ; AVX512DQ-FCP-NEXT: vzeroupper
 ; AVX512DQ-FCP-NEXT: retq
 ;
@@ -214,13 +214,13 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1
-; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [5,7,9,11]
-; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
-; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
-; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax)
-; AVX512BW-NEXT: vmovdqa %ymm2, 64(%rax)
+; AVX512BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11]
+; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3]
+; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
+; AVX512BW-NEXT: vmovdqa %ymm3, 64(%rax)
+; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 ;
@@ -233,13 +233,13 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [5,7,9,11]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm2, 64(%rax)
+; AVX512BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT: vmovdqa %ymm3, 64(%rax)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
 ; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
 ;
@@ -252,13 +252,13 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1
-; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [5,7,9,11]
-; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
-; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax)
-; AVX512DQ-BW-NEXT: vmovdqa %ymm2, 64(%rax)
+; AVX512DQ-BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
+; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11]
+; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3]
+; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
+; AVX512DQ-BW-NEXT: vmovdqa %ymm3, 64(%rax)
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%rax)
 ; AVX512DQ-BW-NEXT: vzeroupper
 ; AVX512DQ-BW-NEXT: retq
 ;
@@ -271,13 +271,13 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [5,7,9,11]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, 64(%rax)
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, 64(%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
 %in.vec0 = load <2 x i64>, ptr %in.vecptr0, align 64
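
As a sanity check on the stride-6 rewrite, this throwaway harness inlines the same scalar vpermi2q model as above (as a lambda, so it is self-contained) and tags each input qword symbolically; the register contents are an assumption read off the visible vinserti128/vinserti64x4/vinserti32x4 setup. It confirms the old mask [5,7,9,11] on (%zmm0, old %zmm1) and the new mask [1,3,9,11] on (%zmm1, new %zmm2) select the same four qwords for the 64(%rax) store.

#include <array>
#include <cassert>
#include <cstdint>

int main() {
  using V8 = std::array<uint64_t, 8>;
  // Same scalar VPERMI2Q model as above, inlined as a lambda.
  auto vpermi2q = [](const V8 &Idx, const V8 &A, const V8 &B) {
    V8 Out{};
    for (int I = 0; I != 8; ++I) {
      uint64_t Sel = Idx[I] & 15;
      Out[I] = Sel < 8 ? A[Sel] : B[Sel - 8];
    }
    return Out;
  };
  // Tag qword E of input S (rdi,rsi,rdx,rcx,r8,r9 -> S = 0..5).
  auto Q = [](uint64_t S, uint64_t E) { return S * 16 + E; };
  // Assumed register contents after the vinserti* setup above:
  V8 Zmm0 = {Q(0, 0), Q(0, 1), Q(1, 0), Q(1, 1),
             Q(2, 0), Q(2, 1), Q(3, 0), Q(3, 1)};   // concat of rdi..rcx data
  V8 NewZmm1 = {Q(2, 0), Q(2, 1), Q(3, 0), Q(3, 1)}; // low half holds %ymm1
  V8 R8R9 = {Q(4, 0), Q(4, 1), Q(5, 0), Q(5, 1)};    // old %zmm1 / new %zmm2

  V8 Old = vpermi2q({5, 7, 9, 11}, Zmm0, R8R9);
  V8 New = vpermi2q({1, 3, 9, 11}, NewZmm1, R8R9);
  for (int I = 0; I != 4; ++I)
    assert(Old[I] == New[I]); // both pick the high qwords of inputs 2..5
  return 0;
}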
