
Commit c37c34b

[X86] Simplify some code in lowerV4X128VectorShuffle and lowerV2X128VectorShuffle
Previously we extracted two subvectors and concatenated them. But the concatenate will be lowered to two insert subvectors, and DAG combine will then merge one of the inserts and one of the extracts back into the original vector. We might as well just directly use one extract and one insert.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@324710 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent 5856e09 commit c37c34b
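For illustration, here is a minimal standalone sketch of the 256-bit (2 x 128-bit lane) case in plain AVX intrinsics, not LLVM code; the function name is made up. The result keeps the low 128-bit lane of the first vector and takes the low 128-bit lane of the second as its high lane, i.e. one extract plus one insert (a single vinsertf128) rather than two extracts feeding a concatenation.

#include <immintrin.h>

/* Illustrative sketch only: keep the low 128-bit lane of v1 and place the
   low 128-bit lane of v2 in the high lane, using one extract and one insert.
   This mirrors the EXTRACT_SUBVECTOR + INSERT_SUBVECTOR pair the patch emits
   instead of two EXTRACT_SUBVECTORs feeding a CONCAT_VECTORS. */
__m256 shuffle_low_halves(__m256 v1, __m256 v2) {
  __m128 sub = _mm256_castps256_ps128(v2);   /* low 128 bits of v2 */
  return _mm256_insertf128_ps(v1, sub, 1);   /* insert as high lane of v1 */
}

In both hunks below, the INSERT_SUBVECTOR index (2 in the v2x128 case, 4 in the v4x128 case) is the element offset of the upper half of the destination type, so the extracted low half of the chosen source lands in the upper half of V1.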

File tree

2 files changed (+60, -59 lines changed)


lib/Target/X86/X86ISelLowering.cpp

Lines changed: 9 additions & 11 deletions
@@ -12761,12 +12761,11 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
     // this will likely become vinsertf128 which can't fold a 256-bit memop.
     if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
       MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
-      SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
-                                DAG.getIntPtrConstant(0, DL));
-      SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
-                                OnlyUsesV1 ? V1 : V2,
-                                DAG.getIntPtrConstant(0, DL));
-      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
+      SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
+                                   OnlyUsesV1 ? V1 : V2,
+                                   DAG.getIntPtrConstant(0, DL));
+      return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
+                         DAG.getIntPtrConstant(2, DL));
     }
   }
 

@@ -13930,12 +13929,11 @@ static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
   if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
                                         {0, 1, 2, 3, 8, 9, 10, 11})) {
     MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
-    SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
-                              DAG.getIntPtrConstant(0, DL));
-    SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
-                              OnlyUsesV1 ? V1 : V2,
+    SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
+                                 OnlyUsesV1 ? V1 : V2,
                                  DAG.getIntPtrConstant(0, DL));
-    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
+    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
+                       DAG.getIntPtrConstant(4, DL));
   }
 
   assert(WidenedMask.size() == 4);

test/CodeGen/X86/x86-interleaved-access.ll

Lines changed: 51 additions & 48 deletions
@@ -1722,11 +1722,10 @@ define void @interleaved_store_vf64_i8_stride4(<64 x i8> %a, <64 x i8> %b, <64 x
 ; AVX1-NEXT:    subq $24, %rsp
 ; AVX1-NEXT:    .cfi_def_cfa_offset 32
 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; AVX1-NEXT:    vmovdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm11
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm12
-; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
-; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm13
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm14
 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
@@ -1736,66 +1735,70 @@ define void @interleaved_store_vf64_i8_stride4(<64 x i8> %a, <64 x i8> %b, <64 x
 ; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
 ; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm13 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15]
+; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
 ; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
-; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15]
-; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15]
+; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; AVX1-NEXT:    vmovdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm3
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
 ; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm4
 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3]
-; AVX1-NEXT:    vmovdqa %xmm8, %xmm1
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm11, %ymm0
-; AVX1-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) # 32-byte Spill
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm8, %ymm11
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7]
-; AVX1-NEXT:    vmovdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3]
-; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm9, %ymm5
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm13 = xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7]
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
-; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm11, %ymm1
+; AVX1-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3]
+; AVX1-NEXT:    vmovdqa %xmm8, %xmm2
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm8, %ymm13
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7]
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7]
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
+; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
+; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm9, %ymm14
 ; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3]
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7]
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm9, %ymm2
+; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
+; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm9, %ymm9
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm12, %ymm8, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm13, %ymm8
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm13[2,3],ymm0[2,3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm6, %ymm1
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm9, %ymm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm15, %ymm8, %ymm6
-; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm11, %ymm8
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm11[2,3],ymm6[2,3]
-; AVX1-NEXT:    vinsertf128 $1, -{{[0-9]+}}(%rsp), %ymm4, %ymm6 # 16-byte Folded Reload
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm9, %ymm6
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm15, %ymm10, %ymm2
 ; AVX1-NEXT:    vmovups -{{[0-9]+}}(%rsp), %ymm3 # 32-byte Reload
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm0
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm3[2,3],ymm6[2,3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm13, %ymm7, %ymm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm5, %ymm7
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3]
-; AVX1-NEXT:    vmovaps %ymm2, 32(%rdi)
+; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm3, %ymm0
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[2,3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm7, %ymm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm14, %ymm7
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
 ; AVX1-NEXT:    vmovaps %ymm3, 224(%rdi)
-; AVX1-NEXT:    vmovaps %ymm6, 192(%rdi)
+; AVX1-NEXT:    vmovaps %ymm2, 192(%rdi)
 ; AVX1-NEXT:    vmovaps %ymm7, 160(%rdi)
 ; AVX1-NEXT:    vmovaps %ymm0, 128(%rdi)
 ; AVX1-NEXT:    vmovaps %ymm1, 96(%rdi)
-; AVX1-NEXT:    vmovaps %ymm9, 64(%rdi)
+; AVX1-NEXT:    vmovaps %ymm5, 64(%rdi)
+; AVX1-NEXT:    vmovaps %ymm6, 32(%rdi)
 ; AVX1-NEXT:    vmovaps %ymm8, (%rdi)
 ; AVX1-NEXT:    addq $24, %rsp
 ; AVX1-NEXT:    vzeroupper
