
Commit 0bd098b

[X86] Fold VPERMV3(WIDEN(X),M,WIDEN(Y)) -> VPERMV(CONCAT(X,Y),M') iff the CONCAT is free (#122750)
Minor follow-up to #122485: if the source operands were widened half-size subvectors, attempt to concatenate the subvectors directly, then adjust the shuffle mask so that references to the second operand now refer to the upper half of the concat result.
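
To make the mask adjustment concrete, here is a minimal standalone C++ sketch (a hypothetical helper, not the LLVM code itself) of the remapping this fold performs: mask indices that selected from the second widened VPERMV3 operand are shifted down by NumElts/2 so they address the upper half of the concatenated source. The values mirror the PR34175 test change below, where the mask [0,8,32,40] becomes [0,8,16,24].

// Standalone illustration (hypothetical helper, not the LLVM code): remap a
// VPERMV3 mask over two widened half-width sources to a VPERMV mask over
// their concatenation. NumElts is the element count of the full result type.
#include <cassert>
#include <cstdio>
#include <vector>

static std::vector<int> remapVPERMV3Mask(std::vector<int> Mask, int NumElts) {
  for (int &M : Mask)
    // Indices < NumElts already point into the first source; indices that
    // selected from the second (widened) operand now land in the upper half
    // of the concatenated vector, i.e. at M - NumElts/2.
    M = (M < NumElts) ? M : (M - NumElts / 2);
  return Mask;
}

int main() {
  // Mirrors the PR34175 test below: a v32i16 VPERMV3 of two widened v16i16
  // halves with mask [0,8,32,40] becomes a VPERMV with mask [0,8,16,24].
  std::vector<int> Old = {0, 8, 32, 40};
  std::vector<int> New = remapVPERMV3Mask(Old, /*NumElts=*/32);
  assert((New == std::vector<int>{0, 8, 16, 24}));
  for (int M : New)
    printf("%d ", M); // prints: 0 8 16 24
  return 0;
}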
Parent: 6c7a53b

3 files changed, +64 -102 lines

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 20 additions & 1 deletion
@@ -41711,9 +41711,10 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
                                     SelectionDAG &DAG,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const X86Subtarget &Subtarget) {
+  using namespace SDPatternMatch;
+
   MVT VT = N.getSimpleValueType();
   unsigned NumElts = VT.getVectorNumElements();
-
   SmallVector<int, 4> Mask;
   unsigned Opcode = N.getOpcode();
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -42436,6 +42437,24 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
                                        /*IsMask=*/true);
       return DAG.getNode(X86ISD::VPERMV, DL, VT, NewMask, N.getOperand(0));
     }
+    // If sources are half width, then concat and use VPERMV with adjusted
+    // mask.
+    SDValue Ops[2];
+    MVT HalfVT = VT.getHalfNumVectorElementsVT();
+    if (sd_match(V1,
+                 m_InsertSubvector(m_Undef(), m_Value(Ops[0]), m_Zero())) &&
+        sd_match(V2,
+                 m_InsertSubvector(m_Undef(), m_Value(Ops[1]), m_Zero())) &&
+        Ops[0].getValueType() == HalfVT && Ops[1].getValueType() == HalfVT) {
+      if (SDValue ConcatSrc =
+              combineConcatVectorOps(DL, VT, Ops, DAG, DCI, Subtarget)) {
+        for (int &M : Mask)
+          M = (M < (int)NumElts ? M : (M - (NumElts / 2)));
+        SDValue NewMask = getConstVector(Mask, MaskVT, DAG, DL,
+                                         /*IsMask=*/true);
+        return DAG.getNode(X86ISD::VPERMV, DL, VT, NewMask, ConcatSrc);
+      }
+    }
     // Commute foldable source to the RHS.
     if (isShuffleFoldableLoad(N.getOperand(0)) &&
         !isShuffleFoldableLoad(N.getOperand(2))) {

llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll

Lines changed: 12 additions & 57 deletions
@@ -2,14 +2,14 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL,AVX512VL-FAST-ALL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL,AVX512VL-FAST-PERLANE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW,AVX512BW-FAST-ALL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW,AVX512BW-FAST-PERLANE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL,AVX512BWVL-FAST-ALL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL,AVX512BWVL-FAST-PERLANE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMIVL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMIVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
 
 ; PR31551
 ; Pairs of shufflevector:trunc functions with functional equivalence.
@@ -74,13 +74,6 @@ define void @shuffle_v64i8_to_v32i8(ptr %L, ptr %S) nounwind {
 ; AVX512VBMI-NEXT: vpmovwb %zmm0, (%rsi)
 ; AVX512VBMI-NEXT: vzeroupper
 ; AVX512VBMI-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v32i8:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512VBMIVL-NEXT: vpmovwb %zmm0, (%rsi)
-; AVX512VBMIVL-NEXT: vzeroupper
-; AVX512VBMIVL-NEXT: retq
 %vec = load <64 x i8>, ptr %L
 %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
 store <32 x i8> %strided.vec, ptr %S
@@ -126,13 +119,6 @@ define void @trunc_v32i16_to_v32i8(ptr %L, ptr %S) nounwind {
 ; AVX512VBMI-NEXT: vpmovwb %zmm0, (%rsi)
 ; AVX512VBMI-NEXT: vzeroupper
 ; AVX512VBMI-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: trunc_v32i16_to_v32i8:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512VBMIVL-NEXT: vpmovwb %zmm0, (%rsi)
-; AVX512VBMIVL-NEXT: vzeroupper
-; AVX512VBMIVL-NEXT: retq
 %vec = load <64 x i8>, ptr %L
 %bc = bitcast <64 x i8> %vec to <32 x i16>
 %strided.vec = trunc <32 x i16> %bc to <32 x i8>
@@ -346,14 +332,6 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_
 ; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512VBMI-NEXT: vzeroupper
 ; AVX512VBMI-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62]
-; AVX512VBMIVL-NEXT: vpermb %zmm0, %zmm1, %zmm0
-; AVX512VBMIVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512VBMIVL-NEXT: vzeroupper
-; AVX512VBMIVL-NEXT: retq
 %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 62>
 ret <16 x i8> %res
 }
@@ -406,12 +384,6 @@ define <32 x i8> @trunc_shuffle_v32i16_v32i8_ofs1(<32 x i16> %a0) {
 ; AVX512VBMI-NEXT: vpsrlw $8, %zmm0, %zmm0
 ; AVX512VBMI-NEXT: vpmovwb %zmm0, %ymm0
 ; AVX512VBMI-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: trunc_shuffle_v32i16_v32i8_ofs1:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512VBMIVL-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512VBMIVL-NEXT: retq
 %bc = bitcast <32 x i16> %a0 to <64 x i8>
 %res = shufflevector <64 x i8> %bc, <64 x i8> poison, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
 ret <32 x i8> %res
@@ -442,11 +414,9 @@ define <4 x double> @PR34175(ptr %p) {
 ;
 ; AVX512BW-LABEL: PR34175:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,8,32,40,0,0,0,0]
-; AVX512BW-NEXT: vmovdqu (%rdi), %ymm1
-; AVX512BW-NEXT: vmovdqu 32(%rdi), %ymm2
-; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
-; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
+; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX512BW-NEXT: vcvtdq2pd %xmm0, %ymm0
 ; AVX512BW-NEXT: retq
 ;
@@ -460,21 +430,11 @@ define <4 x double> @PR34175(ptr %p) {
 ;
 ; AVX512VBMI-LABEL: PR34175:
 ; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vmovq {{.*#+}} xmm0 = [0,8,32,40,0,0,0,0]
-; AVX512VBMI-NEXT: vmovdqu (%rdi), %ymm1
-; AVX512VBMI-NEXT: vmovdqu 32(%rdi), %ymm2
-; AVX512VBMI-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
-; AVX512VBMI-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX512VBMI-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
+; AVX512VBMI-NEXT: vpermw (%rdi), %zmm0, %zmm0
+; AVX512VBMI-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX512VBMI-NEXT: vcvtdq2pd %xmm0, %ymm0
 ; AVX512VBMI-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: PR34175:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
-; AVX512VBMIVL-NEXT: vpermw (%rdi), %zmm0, %zmm0
-; AVX512VBMIVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512VBMIVL-NEXT: vcvtdq2pd %xmm0, %ymm0
-; AVX512VBMIVL-NEXT: retq
 %v = load <32 x i16>, ptr %p, align 2
 %shuf = shufflevector <32 x i16> %v, <32 x i16> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
 %tofp = uitofp <4 x i16> %shuf to <4 x double>
@@ -492,8 +452,3 @@ define <16 x i8> @trunc_v8i64_to_v8i8_return_v16i8(<8 x i64> %vec) nounwind {
 ret <16 x i8> %result
 }
 
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX512BW-FAST-ALL: {{.*}}
-; AVX512BW-FAST-PERLANE: {{.*}}
-; AVX512BWVL-FAST-ALL: {{.*}}
-; AVX512BWVL-FAST-PERLANE: {{.*}}

llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll

Lines changed: 32 additions & 44 deletions
@@ -339,17 +339,14 @@ define void @store_i64_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP: # %bb.0:
 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0
 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm1
-; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm2
-; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm3
-; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
-; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
-; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
-; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,2,10,1,9,3,11]
-; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm4
-; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%r8)
+; AVX512-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1
+; AVX512-FCP-NEXT: vinserti64x4 $1, (%rdx), %zmm0, %zmm0
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,4,12,1,9,5,13]
+; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,10,6,14,3,11,7,15]
+; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%r8)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%r8)
 ; AVX512-FCP-NEXT: vzeroupper
 ; AVX512-FCP-NEXT: retq
 ;
@@ -378,17 +375,14 @@ define void @store_i64_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP: # %bb.0:
 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0
 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm3
-; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
-; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
-; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
-; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,2,10,1,9,3,11]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm4
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%r8)
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rdx), %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,4,12,1,9,5,13]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,10,6,14,3,11,7,15]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%r8)
 ; AVX512DQ-FCP-NEXT: vzeroupper
 ; AVX512DQ-FCP-NEXT: retq
 ;
@@ -417,17 +411,14 @@ define void @store_i64_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP: # %bb.0:
 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0
 ; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %ymm1
-; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm2
-; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm3
-; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
-; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
-; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
-; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,2,10,1,9,3,11]
-; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm4
-; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%r8)
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rdx), %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,4,12,1,9,5,13]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,10,6,14,3,11,7,15]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%r8)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%r8)
 ; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
 ;
@@ -456,17 +447,14 @@ define void @store_i64_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP: # %bb.0:
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %ymm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm3
-; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
-; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
-; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
-; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,2,10,1,9,3,11]
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rdx), %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,4,12,1,9,5,13]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,10,6,14,3,11,7,15]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%r8)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
 %in.vec0 = load <4 x i64>, ptr %in.vecptr0, align 64
