Skip to content

Commit 842a672

Browse files
committed
[X86] LowerTRUNCATE - improve handling during type legalization to PACKSS/PACKUS patterns
Extend coverage for lowering wide vector types during type legalization to allow us to use PACKSS/PACKUS patterns instead of dropping down to shuffle lowering. This is the first step towards avoiding premature folds of TRUNCATE to PACKSS/PACKUS nodes, as described in Issue #63710, which causes a large number of regressions on D152928. Next, we will need to tweak the TRUNCATE widening in ReplaceNodeResults. Differential Revision: https://reviews.llvm.org/D154592
1 parent d21beb5 commit 842a672

16 files changed

+668
-761
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 110 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1239,7 +1239,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
12391239
setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
12401240
setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
12411241
setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
1242+
setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
12421243
setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
1244+
setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
1245+
setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
1246+
setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
1247+
setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1248+
setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
1249+
setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
1250+
setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
12431251

12441252
// In the customized shift lowering, the legal v4i32/v2i64 cases
12451253
// in AVX2 will be recognized.
@@ -1480,9 +1488,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
14801488
setOperationAction(ISD::ANY_EXTEND, VT, Custom);
14811489
}
14821490

1483-
setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1484-
setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
1485-
setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
1491+
setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
1492+
setOperationAction(ISD::TRUNCATE, MVT::v32i16, Custom);
1493+
setOperationAction(ISD::TRUNCATE, MVT::v32i32, Custom);
1494+
setOperationAction(ISD::TRUNCATE, MVT::v32i64, Custom);
1495+
14861496
setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
14871497

14881498
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
@@ -1802,7 +1812,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
18021812
setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal);
18031813
setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
18041814
setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1805-
setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
18061815
setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
18071816
setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
18081817
setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
@@ -2338,10 +2347,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
23382347
setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom);
23392348
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f16, Custom);
23402349
}
2341-
2342-
setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
2343-
setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
2344-
setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
23452350
}
23462351

23472352
if (Subtarget.hasAMXTILE()) {
@@ -22869,6 +22874,84 @@ static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL,
2286922874
return truncateVectorWithPACK(X86ISD::PACKSS, DstVT, In, DL, DAG, Subtarget);
2287022875
}
2287122876

22877+
/// This function lowers a vector truncation of 'extended sign-bits' or
22878+
/// 'extended zero-bits' values.
22879+
/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
22880+
static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In,
22881+
const SDLoc &DL,
22882+
const X86Subtarget &Subtarget,
22883+
SelectionDAG &DAG) {
22884+
MVT SrcVT = In.getSimpleValueType();
22885+
MVT DstSVT = DstVT.getVectorElementType();
22886+
MVT SrcSVT = SrcVT.getVectorElementType();
22887+
if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
22888+
(DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
22889+
return SDValue();
22890+
22891+
unsigned NumSrcEltBits = SrcVT.getScalarSizeInBits();
22892+
unsigned NumPackedSignBits = std::min<unsigned>(DstSVT.getSizeInBits(), 16);
22893+
unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
22894+
22895+
// Truncate with PACKUS if we are truncating a vector with leading zero
22896+
// bits that extend all the way to the packed/truncated value. Pre-SSE41
22897+
// we can only use PACKUSWB.
22898+
KnownBits Known = DAG.computeKnownBits(In);
22899+
if ((NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
22900+
if (SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, DstVT, In, DL, DAG,
22901+
Subtarget))
22902+
return V;
22903+
22904+
// Truncate with PACKSS if we are truncating a vector with sign-bits
22905+
// that extend all the way to the packed/truncated value.
22906+
if ((NumSrcEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
22907+
if (SDValue V = truncateVectorWithPACK(X86ISD::PACKSS, DstVT, In, DL, DAG,
22908+
Subtarget))
22909+
return V;
22910+
22911+
return SDValue();
22912+
}
22913+
22914+
/// This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into
22915+
/// X86ISD::PACKUS/X86ISD::PACKSS operations.
22916+
static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL,
22917+
const X86Subtarget &Subtarget,
22918+
SelectionDAG &DAG) {
22919+
MVT SrcVT = In.getSimpleValueType();
22920+
MVT DstSVT = DstVT.getVectorElementType();
22921+
MVT SrcSVT = SrcVT.getVectorElementType();
22922+
unsigned NumElems = DstVT.getVectorNumElements();
22923+
if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
22924+
(DstSVT == MVT::i8 || DstSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
22925+
NumElems >= 8))
22926+
return SDValue();
22927+
22928+
// SSSE3's pshufb results in less instructions in the cases below.
22929+
if (Subtarget.hasSSSE3() && NumElems == 8) {
22930+
if (SrcSVT == MVT::i16)
22931+
return SDValue();
22932+
if (SrcSVT == MVT::i32 && (DstSVT == MVT::i8 || !Subtarget.hasSSE41()))
22933+
return SDValue();
22934+
}
22935+
22936+
// SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
22937+
// for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
22938+
// truncate 2 x v4i32 to v8i16.
22939+
if (Subtarget.hasSSE41() || DstSVT == MVT::i8)
22940+
return truncateVectorWithPACKUS(DstVT, In, DL, Subtarget, DAG);
22941+
22942+
if (SrcSVT == MVT::i16 || SrcSVT == MVT::i32)
22943+
return truncateVectorWithPACKSS(DstVT, In, DL, Subtarget, DAG);
22944+
22945+
// Special case vXi64 -> vXi16, shuffle to vXi32 and then use PACKSS.
22946+
if (DstSVT == MVT::i16 && SrcSVT == MVT::i64) {
22947+
MVT TruncVT = MVT::getVectorVT(MVT::i32, NumElems);
22948+
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, In);
22949+
return truncateVectorWithPACKSS(DstVT, Trunc, DL, Subtarget, DAG);
22950+
}
22951+
22952+
return SDValue();
22953+
}
22954+
2287222955
static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
2287322956
const X86Subtarget &Subtarget) {
2287422957

@@ -22955,16 +23038,14 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
2295523038
MVT VT = Op.getSimpleValueType();
2295623039
SDValue In = Op.getOperand(0);
2295723040
MVT InVT = In.getSimpleValueType();
22958-
unsigned InNumEltBits = InVT.getScalarSizeInBits();
22959-
2296023041
assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
2296123042
"Invalid TRUNCATE operation");
2296223043

2296323044
// If we're called by the type legalizer, handle a few cases.
2296423045
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2296523046
if (!TLI.isTypeLegal(InVT)) {
2296623047
if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
22967-
VT.is128BitVector()) {
23048+
VT.is128BitVector() && Subtarget.hasAVX512()) {
2296823049
assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
2296923050
"Unexpected subtarget!");
2297023051
// The default behavior is to truncate one step, concatenate, and then
@@ -22981,35 +23062,28 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
2298123062
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
2298223063
}
2298323064

23065+
// Pre-AVX512 see if we can make use of PACKSS/PACKUS.
23066+
if (!Subtarget.hasAVX512()) {
23067+
if (SDValue SignPack =
23068+
LowerTruncateVecPackWithSignBits(VT, In, DL, Subtarget, DAG))
23069+
return SignPack;
23070+
23071+
return LowerTruncateVecPack(VT, In, DL, Subtarget, DAG);
23072+
}
23073+
2298423074
// Otherwise let default legalization handle it.
2298523075
return SDValue();
2298623076
}
2298723077

2298823078
if (VT.getVectorElementType() == MVT::i1)
2298923079
return LowerTruncateVecI1(Op, DAG, Subtarget);
2299023080

22991-
unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
22992-
unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
22993-
2299423081
// Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to
2299523082
// concat from subvectors to use VPTRUNC etc.
22996-
if (!Subtarget.hasAVX512() || isFreeToSplitVector(In.getNode(), DAG)) {
22997-
// Truncate with PACKUS if we are truncating a vector with leading zero
22998-
// bits that extend all the way to the packed/truncated value. Pre-SSE41
22999-
// we can only use PACKUSWB.
23000-
KnownBits Known = DAG.computeKnownBits(In);
23001-
if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
23002-
if (SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG,
23003-
Subtarget))
23004-
return V;
23005-
23006-
// Truncate with PACKSS if we are truncating a vector with sign-bits
23007-
// that extend all the way to the packed/truncated value.
23008-
if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
23009-
if (SDValue V = truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG,
23010-
Subtarget))
23011-
return V;
23012-
}
23083+
if (!Subtarget.hasAVX512() || isFreeToSplitVector(In.getNode(), DAG))
23084+
if (SDValue SignPack =
23085+
LowerTruncateVecPackWithSignBits(VT, In, DL, Subtarget, DAG))
23086+
return SignPack;
2301323087

2301423088
// vpmovqb/w/d, vpmovdb/w, vpmovwb
2301523089
if (Subtarget.hasAVX512()) {
@@ -23068,27 +23142,9 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
2306823142
return DAG.getBitcast(MVT::v8i16, In);
2306923143
}
2307023144

23071-
SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
23072-
DAG.getIntPtrConstant(0, DL));
23073-
SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
23074-
DAG.getIntPtrConstant(4, DL));
23075-
23076-
// The PSHUFB mask:
23077-
static const int ShufMask1[] = {0, 2, 4, 6, -1, -1, -1, -1};
23078-
23079-
OpLo = DAG.getBitcast(MVT::v8i16, OpLo);
23080-
OpHi = DAG.getBitcast(MVT::v8i16, OpHi);
23081-
23082-
OpLo = DAG.getVectorShuffle(MVT::v8i16, DL, OpLo, OpLo, ShufMask1);
23083-
OpHi = DAG.getVectorShuffle(MVT::v8i16, DL, OpHi, OpHi, ShufMask1);
23084-
23085-
OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
23086-
OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
23087-
23088-
// The MOVLHPS Mask:
23089-
static const int ShufMask2[] = {0, 1, 4, 5};
23090-
SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
23091-
return DAG.getBitcast(MVT::v8i16, res);
23145+
return Subtarget.hasSSE41()
23146+
? truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG)
23147+
: truncateVectorWithPACKSS(VT, In, DL, Subtarget, DAG);
2309223148
}
2309323149

2309423150
if (VT == MVT::v16i8 && InVT == MVT::v16i16)
@@ -53152,6 +53208,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
5315253208
/// legalization the truncation will be translated into a BUILD_VECTOR with each
5315353209
/// element that is extracted from a vector and then truncated, and it is
5315453210
/// difficult to do this optimization based on them.
53211+
/// TODO: Remove this and just use LowerTruncateVecPack.
5315553212
static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
5315653213
const X86Subtarget &Subtarget) {
5315753214
EVT OutVT = N->getValueType(0);
@@ -53200,6 +53257,7 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
5320053257
/// This function transforms vector truncation of 'extended sign-bits' or
5320153258
/// 'extended zero-bits' values.
5320253259
/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
53260+
/// TODO: Remove this and just use LowerTruncateVecPackWithSignBits.
5320353261
static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
5320453262
SelectionDAG &DAG,
5320553263
const X86Subtarget &Subtarget) {

llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -451,13 +451,13 @@ define i8 @v8i32_or_and(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d,
451451
define i8 @v8i32_or_vselect(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
452452
; SSE2-SSSE3-LABEL: v8i32_or_vselect:
453453
; SSE2-SSSE3: # %bb.0:
454-
; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm1
455454
; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
455+
; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm1
456+
; SSE2-SSSE3-NEXT: por %xmm5, %xmm1
457+
; SSE2-SSSE3-NEXT: por %xmm4, %xmm0
456458
; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0
457-
; SSE2-SSSE3-NEXT: packssdw %xmm5, %xmm4
458-
; SSE2-SSSE3-NEXT: por %xmm0, %xmm4
459-
; SSE2-SSSE3-NEXT: packsswb %xmm4, %xmm4
460-
; SSE2-SSSE3-NEXT: pmovmskb %xmm4, %eax
459+
; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm0
460+
; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax
461461
; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax
462462
; SSE2-SSSE3-NEXT: retq
463463
;
@@ -514,10 +514,8 @@ define i8 @v8i32_or_vselect(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
514514
define i8 @v8i32_or_select(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3, i1 %a4) {
515515
; SSE2-SSSE3-LABEL: v8i32_or_select:
516516
; SSE2-SSSE3: # %bb.0:
517-
; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm3
518517
; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm2
519-
; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm2
520-
; SSE2-SSSE3-NEXT: packssdw %xmm7, %xmm6
518+
; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm3
521519
; SSE2-SSSE3-NEXT: testb $1, %dil
522520
; SSE2-SSSE3-NEXT: jne .LBB7_1
523521
; SSE2-SSSE3-NEXT: # %bb.2:
@@ -528,7 +526,9 @@ define i8 @v8i32_or_select(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32
528526
; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm0
529527
; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0
530528
; SSE2-SSSE3-NEXT: .LBB7_3:
529+
; SSE2-SSSE3-NEXT: por %xmm3, %xmm7
531530
; SSE2-SSSE3-NEXT: por %xmm2, %xmm6
531+
; SSE2-SSSE3-NEXT: packssdw %xmm7, %xmm6
532532
; SSE2-SSSE3-NEXT: por %xmm0, %xmm6
533533
; SSE2-SSSE3-NEXT: packsswb %xmm6, %xmm6
534534
; SSE2-SSSE3-NEXT: pmovmskb %xmm6, %eax

llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll

Lines changed: 20 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -356,21 +356,18 @@ define i16 @v16i32(<16 x i32> %a, <16 x i32> %b, <16 x i32> %c, <16 x i32> %d) {
356356
;
357357
; AVX2-LABEL: v16i32:
358358
; AVX2: # %bb.0:
359-
; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0
360-
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
361359
; AVX2-NEXT: vpcmpgtd %ymm3, %ymm1, %ymm1
362-
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
363-
; AVX2-NEXT: vpcmpgtd %ymm6, %ymm4, %ymm4
364-
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6
365-
; AVX2-NEXT: vpand %xmm6, %xmm2, %xmm2
366-
; AVX2-NEXT: vpcmpgtd %ymm7, %ymm5, %ymm5
367-
; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
368-
; AVX2-NEXT: vpand %xmm6, %xmm3, %xmm3
369-
; AVX2-NEXT: vpand %xmm5, %xmm1, %xmm1
370-
; AVX2-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
371-
; AVX2-NEXT: vpand %xmm4, %xmm0, %xmm0
372-
; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
360+
; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0
361+
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
362+
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
363+
; AVX2-NEXT: vpcmpgtd %ymm7, %ymm5, %ymm2
364+
; AVX2-NEXT: vpcmpgtd %ymm6, %ymm4, %ymm3
365+
; AVX2-NEXT: vpackssdw %ymm2, %ymm3, %ymm2
366+
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
367+
; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
368+
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
373369
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
370+
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
374371
; AVX2-NEXT: vpmovmskb %xmm0, %eax
375372
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
376373
; AVX2-NEXT: vzeroupper
@@ -450,21 +447,18 @@ define i16 @v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %c, <16 x floa
450447
;
451448
; AVX2-LABEL: v16f32:
452449
; AVX2: # %bb.0:
453-
; AVX2-NEXT: vcmpltps %ymm0, %ymm2, %ymm0
454-
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
455450
; AVX2-NEXT: vcmpltps %ymm1, %ymm3, %ymm1
456-
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm3
457-
; AVX2-NEXT: vcmpltps %ymm4, %ymm6, %ymm4
458-
; AVX2-NEXT: vextractf128 $1, %ymm4, %xmm6
459-
; AVX2-NEXT: vandps %xmm6, %xmm2, %xmm2
460-
; AVX2-NEXT: vcmpltps %ymm5, %ymm7, %ymm5
461-
; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm6
462-
; AVX2-NEXT: vandps %xmm6, %xmm3, %xmm3
463-
; AVX2-NEXT: vandps %xmm5, %xmm1, %xmm1
464-
; AVX2-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
465-
; AVX2-NEXT: vandps %xmm4, %xmm0, %xmm0
466-
; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
451+
; AVX2-NEXT: vcmpltps %ymm0, %ymm2, %ymm0
452+
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
453+
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
454+
; AVX2-NEXT: vcmpltps %ymm5, %ymm7, %ymm2
455+
; AVX2-NEXT: vcmpltps %ymm4, %ymm6, %ymm3
456+
; AVX2-NEXT: vpackssdw %ymm2, %ymm3, %ymm2
457+
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
458+
; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
459+
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
467460
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
461+
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
468462
; AVX2-NEXT: vpmovmskb %xmm0, %eax
469463
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
470464
; AVX2-NEXT: vzeroupper

0 commit comments

Comments
 (0)