Skip to content

Commit ddd2f57

Browse files
authored
[X86] Use NSW/NUW flags on ISD::TRUNCATE nodes to improve X86 PACKSS/PACKUS lowering (#123956)
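If the NSW/NUW flags are present on the ISD::TRUNCATE node, then we can assume the source value already fits within the signed/unsigned range of the truncated type, so the PACKSS (for NSW) / PACKUS (for NUW) instructions will not saturate and produce the same result as a plain truncation. Fixes #87485.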
1 parent f6253f8 commit ddd2f57

File tree

2 files changed

+219
-837
lines changed

2 files changed

+219
-837
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -20822,7 +20822,8 @@ static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL,
2082220822
static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
2082320823
SDValue In, const SDLoc &DL,
2082420824
SelectionDAG &DAG,
20825-
const X86Subtarget &Subtarget) {
20825+
const X86Subtarget &Subtarget,
20826+
const SDNodeFlags Flags = SDNodeFlags()) {
2082620827
// Requires SSE2.
2082720828
if (!Subtarget.hasSSE2())
2082820829
return SDValue();
@@ -20868,7 +20869,8 @@ static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
2086820869
// e.g. Masks, zext_in_reg, etc.
2086920870
// Pre-SSE41 we can only use PACKUSWB.
2087020871
KnownBits Known = DAG.computeKnownBits(In);
20871-
if ((NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
20872+
if ((Flags.hasNoUnsignedWrap() && NumDstEltBits <= NumPackedZeroBits) ||
20873+
(NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
2087220874
PackOpcode = X86ISD::PACKUS;
2087320875
return In;
2087420876
}
@@ -20887,7 +20889,7 @@ static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
2088720889
return SDValue();
2088820890

2088920891
unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits;
20890-
if (MinSignBits < NumSignBits) {
20892+
if (Flags.hasNoSignedWrap() || MinSignBits < NumSignBits) {
2089120893
PackOpcode = X86ISD::PACKSS;
2089220894
return In;
2089320895
}
@@ -20909,10 +20911,9 @@ static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
2090920911
/// This function lowers a vector truncation of 'extended sign-bits' or
2091020912
/// 'extended zero-bits' values.
2091120913
/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
20912-
static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In,
20913-
const SDLoc &DL,
20914-
const X86Subtarget &Subtarget,
20915-
SelectionDAG &DAG) {
20914+
static SDValue LowerTruncateVecPackWithSignBits(
20915+
MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget,
20916+
SelectionDAG &DAG, const SDNodeFlags Flags = SDNodeFlags()) {
2091620917
MVT SrcVT = In.getSimpleValueType();
2091720918
MVT DstSVT = DstVT.getVectorElementType();
2091820919
MVT SrcSVT = SrcVT.getVectorElementType();
@@ -20934,8 +20935,8 @@ static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In,
2093420935
}
2093520936

2093620937
unsigned PackOpcode;
20937-
if (SDValue Src =
20938-
matchTruncateWithPACK(PackOpcode, DstVT, In, DL, DAG, Subtarget))
20938+
if (SDValue Src = matchTruncateWithPACK(PackOpcode, DstVT, In, DL, DAG,
20939+
Subtarget, Flags))
2093920940
return truncateVectorWithPACK(PackOpcode, DstVT, Src, DL, DAG, Subtarget);
2094020941

2094120942
return SDValue();
@@ -21105,8 +21106,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
2110521106
// Pre-AVX512 (or prefer-256bit) see if we can make use of PACKSS/PACKUS.
2110621107
if (!Subtarget.hasAVX512() ||
2110721108
(InVT.is512BitVector() && VT.is256BitVector()))
21108-
if (SDValue SignPack =
21109-
LowerTruncateVecPackWithSignBits(VT, In, DL, Subtarget, DAG))
21109+
if (SDValue SignPack = LowerTruncateVecPackWithSignBits(
21110+
VT, In, DL, Subtarget, DAG, Op->getFlags()))
2111021111
return SignPack;
2111121112

2111221113
// Pre-AVX512 see if we can make use of PACKSS/PACKUS.
@@ -21123,8 +21124,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
2112321124
// Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to
2112421125
// concat from subvectors to use VPTRUNC etc.
2112521126
if (!Subtarget.hasAVX512() || isFreeToSplitVector(In.getNode(), DAG))
21126-
if (SDValue SignPack =
21127-
LowerTruncateVecPackWithSignBits(VT, In, DL, Subtarget, DAG))
21127+
if (SDValue SignPack = LowerTruncateVecPackWithSignBits(
21128+
VT, In, DL, Subtarget, DAG, Op->getFlags()))
2112821129
return SignPack;
2112921130

2113021131
// vpmovqb/w/d, vpmovdb/w, vpmovwb
@@ -33594,10 +33595,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
3359433595

3359533596
// See if there are sufficient leading bits to perform a PACKUS/PACKSS.
3359633597
unsigned PackOpcode;
33597-
if (SDValue Src =
33598-
matchTruncateWithPACK(PackOpcode, VT, In, dl, DAG, Subtarget)) {
33599-
if (SDValue Res = truncateVectorWithPACK(PackOpcode, VT, Src,
33600-
dl, DAG, Subtarget)) {
33598+
if (SDValue Src = matchTruncateWithPACK(PackOpcode, VT, In, dl, DAG,
33599+
Subtarget, N->getFlags())) {
33600+
if (SDValue Res =
33601+
truncateVectorWithPACK(PackOpcode, VT, Src, dl, DAG, Subtarget)) {
3360133602
Res = widenSubVector(WidenVT, Res, false, Subtarget, DAG, dl);
3360233603
Results.push_back(Res);
3360333604
return;

0 commit comments

Comments
 (0)