
Commit 5767497

Add Extend shuffle pattern to vNf32 shuffles.

There are some cases where it's useful for float types: not quite as hot as in the integer case, but still better than the alternatives.

Differential Revision: https://reviews.llvm.org/D143785

1 parent: aca34da
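For readers unfamiliar with the pattern: a float shuffle whose gap lanes are all known to be zero is an integer zero-extension in disguise, so it can be bitcast to the matching integer type and lowered through the existing extend path. Below is a minimal standalone sketch of the mask test, illustrative only (the helper name isZExt2xShuffle is invented here; the actual routine is lowerShuffleAsZeroOrAnyExtend, which handles arbitrary scales and offsets):

```cpp
#include <array>
#include <cstdio>

// Sketch: detect a 2x zero-extension pattern in a 4-element shuffle mask.
// Even lane 2*i must read source element i; odd lanes must be known zero.
static bool isZExt2xShuffle(const std::array<int, 4> &Mask,
                            const std::array<bool, 4> &Zeroable) {
  for (int i = 0; i < 4; ++i) {
    if (i % 2 == 0) {
      if (Mask[i] != i / 2)
        return false; // even lanes must be src[0], src[1], ...
    } else if (!Zeroable[i]) {
      return false; // odd lanes must be provably zero
    }
  }
  return true;
}

int main() {
  // A v4f32 shuffle producing <a0, 0.0, a1, 0.0>: after a bitcast to v4i32
  // this is exactly a pmovzx-style zero extension of the low two elements.
  std::printf("%d\n",
              isZExt2xShuffle({0, -1, 1, -1}, {false, true, false, true}));
}
```

Once the mask is recognized, the patch simply bitcasts the float vector to the integer type, reuses the integer extend lowering, and bitcasts the result back; that is what the v4f32, v8f32, and v16f32 hunks below do.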

File tree: 5 files changed, +137 −75 lines
llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 90 additions & 25 deletions
@@ -14322,13 +14322,15 @@ static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
       return SDValue();
     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
                                  NumElements / Scale);
+    InputV = DAG.getBitcast(VT, InputV);
     InputV = ShuffleOffset(InputV);
     InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
                                     DL, ExtVT, InputV, DAG);
     return DAG.getBitcast(VT, InputV);
   }
 
   assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
+  InputV = DAG.getBitcast(VT, InputV);
 
   // For any extends we can cheat for larger element sizes and use shuffle
   // instructions that can fold with a load and/or copy.
@@ -15488,6 +15490,13 @@ static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                             getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
   }
 
+  if (Subtarget.hasSSE2())
+    if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
+            DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
+      ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
+      return ZExt;
+    }
+
   if (Subtarget.hasAVX2())
     if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
       return Extract;
@@ -16872,7 +16881,7 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
 /// AVX vector shuffle types.
 static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
                                     SDValue V2, ArrayRef<int> Mask,
-                                    SelectionDAG &DAG) {
+                                    SelectionDAG &DAG, bool SimpleOnly) {
   assert(VT.getSizeInBits() >= 256 &&
          "Only for 256-bit or wider vector shuffles!");
   assert(V1.getSimpleValueType() == VT && "Bad operand type!");
@@ -16900,34 +16909,60 @@ static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
   std::tie(LoV2, HiV2) = SplitVector(V2);
 
   // Now create two 4-way blends of these half-width vectors.
-  auto HalfBlend = [&](ArrayRef<int> HalfMask) {
-    bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
-    SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
-    SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
-    SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
+  auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
+                                   bool &UseHiV1, bool &UseLoV2,
+                                   bool &UseHiV2) {
+    UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
     for (int i = 0; i < SplitNumElements; ++i) {
       int M = HalfMask[i];
       if (M >= NumElements) {
         if (M >= NumElements + SplitNumElements)
           UseHiV2 = true;
         else
           UseLoV2 = true;
-        V2BlendMask[i] = M - NumElements;
-        BlendMask[i] = SplitNumElements + i;
       } else if (M >= 0) {
         if (M >= SplitNumElements)
           UseHiV1 = true;
         else
           UseLoV1 = true;
+      }
+    }
+  };
+
+  auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
+    if (!SimpleOnly)
+      return true;
+
+    bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
+    GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
+
+    return !(UseHiV1 || UseHiV2);
+  };
+
+  auto HalfBlend = [&](ArrayRef<int> HalfMask) {
+    SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
+    SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
+    SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
+    for (int i = 0; i < SplitNumElements; ++i) {
+      int M = HalfMask[i];
+      if (M >= NumElements) {
+        V2BlendMask[i] = M - NumElements;
+        BlendMask[i] = SplitNumElements + i;
+      } else if (M >= 0) {
         V1BlendMask[i] = M;
         BlendMask[i] = i;
       }
     }
 
+    bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
+    GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
+
     // Because the lowering happens after all combining takes place, we need to
     // manually combine these blend masks as much as possible so that we create
     // a minimal number of high-level vector shuffle nodes.
 
+    assert(!SimpleOnly || (!UseHiV1 && !UseHiV2) && "Shuffle won't be simple");
+
     // First try just blending the halves of V1 or V2.
     if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
       return DAG.getUNDEF(SplitVT);
@@ -16938,8 +16973,7 @@ static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
 
     SDValue V1Blend, V2Blend;
     if (UseLoV1 && UseHiV1) {
-      V1Blend =
-        DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
+      V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
     } else {
       // We only use half of V1 so map the usage down into the final blend mask.
       V1Blend = UseLoV1 ? LoV1 : HiV1;
@@ -16948,8 +16982,7 @@ static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
         BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
     }
     if (UseLoV2 && UseHiV2) {
-      V2Blend =
-        DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
+      V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
     } else {
       // We only use half of V2 so map the usage down into the final blend mask.
       V2Blend = UseLoV2 ? LoV2 : HiV2;
@@ -16959,6 +16992,10 @@ static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
     }
     return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
   };
+
+  if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
+    return SDValue();
+
   SDValue Lo = HalfBlend(LoMask);
   SDValue Hi = HalfBlend(HiMask);
   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
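To make the new SimpleOnly mode concrete: CheckHalfBlendUsable rejects a half-mask whenever it would need the high half of either input, since that would force an extra cross-half shuffle. A standalone sketch of that predicate, under invented names (isSimpleHalfMask is not from the patch):

```cpp
#include <cstdio>
#include <vector>

// Sketch of the "simple" test: every used element must come from the low
// half of V1 or the low half of V2; undef (-1) lanes are ignored.
static bool isSimpleHalfMask(const std::vector<int> &HalfMask,
                             int NumElements) {
  int SplitNumElements = NumElements / 2;
  for (int M : HalfMask) {
    if (M < 0)
      continue;
    int Src = M >= NumElements ? M - NumElements : M; // index into V1 or V2
    if (Src >= SplitNumElements)
      return false; // needs HiV1/HiV2, so not "simple"
  }
  return true;
}

int main() {
  // 8-element shuffle split into 4-element halves (NumElements = 8).
  std::printf("%d\n", isSimpleHalfMask({0, 8, 1, 9}, 8));   // 1: low halves only
  std::printf("%d\n", isSimpleHalfMask({0, 12, 1, 13}, 8)); // 0: needs HiV2
}
```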
@@ -17015,7 +17052,8 @@ static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
     if (Mask[i] >= 0)
       LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
   if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
-    return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+    return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
+                                /*SimpleOnly*/ false);
 
   // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
   // requires that the decomposed single-input shuffles don't end up here.
@@ -17163,6 +17201,20 @@ static SDValue lowerShuffleAsLanePermuteAndPermute(
   return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
 }
 
+/// Helper to get compute inlane shuffle mask for a complete shuffle mask.
+static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
+                                     SmallVector<int> &InLaneMask) {
+  int Size = Mask.size();
+  InLaneMask.assign(Mask.begin(), Mask.end());
+  for (int i = 0; i < Size; ++i) {
+    int &M = InLaneMask[i];
+    if (M < 0)
+      continue;
+    if (((M % Size) / LaneSize) != (i / LaneSize))
+      M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
+  }
+}
+
 /// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
 /// source with a lane permutation.
 ///
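A worked example of the hoisted helper: elements that cross a 128-bit lane are remapped to the same offset within the destination lane and tagged by adding Size, which in LLVM's shuffle-mask convention makes them read from the second operand, here the lane-flipped copy the caller builds next. The following re-implementation is illustrative only, substituting std::vector for ArrayRef/SmallVector:

```cpp
#include <cstdio>
#include <vector>

// Mirror of computeInLaneShuffleMask above, for demonstration.
static void computeInLaneMask(const std::vector<int> &Mask, int LaneSize,
                              std::vector<int> &InLaneMask) {
  int Size = (int)Mask.size();
  InLaneMask.assign(Mask.begin(), Mask.end());
  for (int i = 0; i < Size; ++i) {
    int &M = InLaneMask[i];
    if (M < 0)
      continue;
    if (((M % Size) / LaneSize) != (i / LaneSize)) // cross-lane element
      M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
  }
}

int main() {
  // v8f32 with two 128-bit lanes (LaneSize = 4); elements 0 and 4 cross lanes.
  std::vector<int> Mask = {4, 1, 2, 3, 0, 5, 6, 7}, InLane;
  computeInLaneMask(Mask, /*LaneSize=*/4, InLane);
  for (int M : InLane)
    std::printf("%d ", M); // prints: 8 1 2 3 12 5 6 7
  std::printf("\n");
}
```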
@@ -17207,21 +17259,17 @@ static SDValue lowerShuffleAsLanePermuteAndShuffle(
   assert(V2.isUndef() &&
          "This last part of this routine only works on single input shuffles");
 
-  SmallVector<int, 32> InLaneMask(Mask);
-  for (int i = 0; i < Size; ++i) {
-    int &M = InLaneMask[i];
-    if (M < 0)
-      continue;
-    if (((M % Size) / LaneSize) != (i / LaneSize))
-      M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
-  }
+  SmallVector<int> InLaneMask;
+  computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
+
   assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
          "In-lane shuffle mask expected");
 
   // If we're not using both lanes in each lane and the inlane mask is not
   // repeating, then we're better off splitting.
   if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
-    return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+    return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
+                                /*SimpleOnly*/ false);
 
   // Flip the lanes, and shuffle the results which should now be in-lane.
   MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
@@ -18356,6 +18404,19 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                                   Subtarget, DAG))
     return Broadcast;
 
+  if (!Subtarget.hasAVX2()) {
+    SmallVector<int> InLaneMask;
+    computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
+
+    if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
+      if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
+                                           /*SimpleOnly*/ true))
+        return R;
+  }
+  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
+                                                   Zeroable, Subtarget, DAG))
+    return DAG.getBitcast(MVT::v8f32, ZExt);
+
   // If the shuffle mask is repeated in each 128-bit lane, we have many more
   // options to efficiently lower the shuffle.
   SmallVector<int, 4> RepeatedMask;
@@ -18848,7 +18909,7 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
       return V;
     if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
       return V;
-    return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+    return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
   }
 
   MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
@@ -19087,6 +19148,10 @@ static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                            Zeroable, Subtarget, DAG))
     return Blend;
 
+  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
+          DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
+    return DAG.getBitcast(MVT::v16f32, ZExt);
+
   // Try to create an in-lane repeating shuffle mask and then shuffle the
   // results into the target lanes.
   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
@@ -19404,7 +19469,7 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
   if (Subtarget.hasVBMI())
     return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
 
-  return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
+  return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
 }
 
 /// High-level routine to lower various 512-bit x86 vector shuffles.
@@ -19449,7 +19514,7 @@ static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
     if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
       return V;
 
-    return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+    return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
   }
 
   if (VT == MVT::v32f16) {

llvm/test/CodeGen/X86/pr43866.ll

Lines changed: 2 additions & 5 deletions
@@ -15,12 +15,9 @@ define dso_local void @test() {
 ; CHECK-NEXT:    subq $64, %rsp
 ; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[1,0],xmm0[1,0]
+; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[2,0],xmm0[0,0]
 ; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmptrueps %ymm2, %ymm2, %ymm2
-; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; CHECK-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
-; CHECK-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[0,0],ymm1[6,4],ymm0[4,4]
 ; CHECK-NEXT:    vmovaps %ymm0, (%rsp)
 ; CHECK-NEXT:    movq %rbp, %rsp
 ; CHECK-NEXT:    popq %rbp

llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll

Lines changed: 19 additions & 8 deletions
@@ -491,16 +491,27 @@ define <8 x float> @shuffle_v8f32_091b2d3f(<8 x float> %a, <8 x float> %b) {
 ;
 ; AVX2-LABEL: shuffle_v8f32_091b2d3f:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <0,u,1,u,2,u,3,u>
-; AVX2-NEXT:    vpermps %ymm0, %ymm2, %ymm0
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
 ; AVX2-NEXT:    retq
 ;
-; AVX512VL-LABEL: shuffle_v8f32_091b2d3f:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovaps {{.*#+}} ymm2 = [0,9,1,11,2,13,3,15]
-; AVX512VL-NEXT:    vpermt2ps %ymm1, %ymm2, %ymm0
-; AVX512VL-NEXT:    retq
+; AVX512VL-SLOW-LABEL: shuffle_v8f32_091b2d3f:
+; AVX512VL-SLOW:       # %bb.0:
+; AVX512VL-SLOW-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512VL-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX512VL-SLOW-NEXT:    retq
+;
+; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_091b2d3f:
+; AVX512VL-FAST-ALL:       # %bb.0:
+; AVX512VL-FAST-ALL-NEXT:    vmovaps {{.*#+}} ymm2 = [0,9,1,11,2,13,3,15]
+; AVX512VL-FAST-ALL-NEXT:    vpermt2ps %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-ALL-NEXT:    retq
+;
+; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_091b2d3f:
+; AVX512VL-FAST-PERLANE:       # %bb.0:
+; AVX512VL-FAST-PERLANE-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512VL-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX512VL-FAST-PERLANE-NEXT:    retq
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
  ret <8 x float> %shuffle
 }
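Why the new AVX2 sequence is legal may not be obvious at a glance: vpmovzxdq writes zeros into exactly the odd 32-bit lanes, and the following vpblendd overwrites those same lanes with elements of %b, so the zeros never survive. A small sanity-check sketch (illustrative; lanes modeled as 32-bit ids rather than floats):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t a[8], b[8];
  for (int i = 0; i < 8; ++i) { a[i] = i; b[i] = 8 + i; }

  // vpmovzxdq: ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
  uint32_t zext[8];
  for (int i = 0; i < 4; ++i) { zext[2 * i] = a[i]; zext[2 * i + 1] = 0; }

  // vpblendd: keep zext on even lanes, take b on odd lanes.
  uint32_t res[8];
  for (int i = 0; i < 8; ++i) res[i] = (i % 2) ? b[i] : zext[i];

  // Compare against shufflevector mask <0,9,1,11,2,13,3,15> over a,b.
  int mask[8] = {0, 9, 1, 11, 2, 13, 3, 15};
  bool ok = true;
  for (int i = 0; i < 8; ++i) {
    uint32_t expected = mask[i] < 8 ? a[mask[i]] : b[mask[i] - 8];
    ok = ok && (res[i] == expected);
  }
  std::printf("equivalent: %s\n", ok ? "yes" : "no"); // prints: yes
}
```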

llvm/test/CodeGen/X86/vector-shuffle-combining.ll

Lines changed: 6 additions & 6 deletions
@@ -2820,17 +2820,17 @@ define <4 x float> @PR30264(<4 x float> %x) {
 ; SSE2-LABEL: PR30264:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],mem[2,3]
-; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE2-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[0],mem[1]
+; SSE2-NEXT:    movapd %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: PR30264:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    xorps %xmm1, %xmm1
-; SSSE3-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],mem[2,3]
-; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSSE3-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[0],mem[1]
+; SSSE3-NEXT:    movapd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: PR30264:

llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll

Lines changed: 20 additions & 31 deletions
@@ -4677,20 +4677,16 @@ define void @vec384_v12i32_to_v6i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bi
 ; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
 ; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
+; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
 ; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
-; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
 ; AVX-NEXT:    vpaddb 32(%rdx), %xmm1, %xmm1
+; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
+; AVX-NEXT:    vpaddb (%rdx), %xmm2, %xmm2
+; AVX-NEXT:    vmovdqa %xmm2, (%rcx)
+; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
 ; AVX-NEXT:    vmovdqa %xmm1, 32(%rcx)
-; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
-; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
-; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
 ; AVX2-LABEL: vec384_v12i32_to_v6i64_factor2:
@@ -7005,26 +7001,19 @@ define void @vec512_v16i32_to_v8i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bi
 ; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
 ; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
-; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
-; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT:    vpaddb 48(%rdx), %xmm2, %xmm2
-; AVX-NEXT:    vpaddb 32(%rdx), %xmm1, %xmm1
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX-NEXT:    vpaddb 16(%rdx), %xmm3, %xmm3
-; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
-; AVX-NEXT:    vmovdqa %xmm3, 16(%rcx)
-; AVX-NEXT:    vmovdqa %xmm1, 32(%rcx)
-; AVX-NEXT:    vmovdqa %xmm2, 48(%rcx)
-; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero
+; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX-NEXT:    vpaddb 48(%rdx), %xmm1, %xmm1
+; AVX-NEXT:    vpaddb 32(%rdx), %xmm4, %xmm3
+; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
+; AVX-NEXT:    vpaddb (%rdx), %xmm2, %xmm2
+; AVX-NEXT:    vmovdqa %xmm2, (%rcx)
+; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
+; AVX-NEXT:    vmovdqa %xmm3, 32(%rcx)
+; AVX-NEXT:    vmovdqa %xmm1, 48(%rcx)
 ; AVX-NEXT:    retq
 ;
 ; AVX2-LABEL: vec512_v16i32_to_v8i64_factor2:
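In both hunks the improved AVX lowering builds the i32-to-i64 zero-extension from two 128-bit ops instead of a 256-bit shuffle and blend: vpmovzxdq widens the low two i32 elements and vpunpckhdq against a zeroed register widens the high two, which keeps everything in xmm registers and drops the trailing vzeroupper. A scalar sketch of the data movement for one xmm's worth of elements (illustrative only, little-endian lane model):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t src[4] = {10, 20, 30, 40};

  // vpmovzxdq: widen elements 0 and 1 -> {src[0], 0, src[1], 0}
  uint32_t lo[4] = {src[0], 0, src[1], 0};
  // vpunpckhdq with a zeroed register: widen elements 2 and 3
  uint32_t hi[4] = {src[2], 0, src[3], 0};

  // Reinterpreting {lo, hi} as four i64 lanes yields zext(src).
  const uint32_t *half[2] = {lo, hi};
  for (int i = 0; i < 4; ++i) {
    const uint32_t *h = half[i / 2];
    uint64_t v = ((uint64_t)h[2 * (i % 2) + 1] << 32) | h[2 * (i % 2)];
    std::printf("elt %d = %llu\n", i, (unsigned long long)v); // 10, 20, 30, 40
  }
}
```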
