Skip to content

Commit 2ed914c

Browse files
committed
[X86][SSE] getFauxShuffleMask - handle PACKSS(SRAI(),SRAI()) shuffle patterns.
We can't easily treat ASHR as a faux shuffle, but if it was just feeding a PACKSS then it was likely being used as sign-extension for a truncation, so just peek through and adjust the mask accordingly.
1 parent 7e44208 commit 2ed914c

File tree

2 files changed

+63
-72
lines changed

2 files changed

+63
-72
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7685,12 +7685,26 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
76857685

76867686
// If we know input saturation won't happen (or we don't care for particular
76877687
// lanes), we can treat this as a truncation shuffle.
7688+
bool Offset0 = false, Offset1 = false;
76887689
if (Opcode == X86ISD::PACKSS) {
76897690
if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&
76907691
DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
76917692
(!(N1.isUndef() || EltsRHS.isNullValue()) &&
76927693
DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
76937694
return false;
7695+
// We can't easily fold ASHR into a shuffle, but if it was feeding a
7696+
// PACKSS then it was likely being used for sign-extension for a
7697+
// truncation, so just peek through and adjust the mask accordingly.
7698+
if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
7699+
N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
7700+
Offset0 = true;
7701+
N0 = N0.getOperand(0);
7702+
}
7703+
if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
7704+
N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
7705+
Offset1 = true;
7706+
N1 = N1.getOperand(0);
7707+
}
76947708
} else {
76957709
APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
76967710
if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&
@@ -7707,6 +7721,13 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
77077721
Ops.push_back(N1);
77087722

77097723
createPackShuffleMask(VT, Mask, IsUnary);
7724+
7725+
if (Offset0 || Offset1) {
7726+
for (int &M : Mask)
7727+
if ((Offset0 && isInRange(M, 0, NumElts)) ||
7728+
(Offset1 && isInRange(M, NumElts, 2 * NumElts)))
7729+
++M;
7730+
}
77107731
return true;
77117732
}
77127733
case X86ISD::VTRUNC: {

llvm/test/CodeGen/X86/psubus.ll

Lines changed: 42 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -1403,11 +1403,6 @@ define <8 x i16> @psubus_8i32_max(<8 x i16> %x, <8 x i32> %y) nounwind {
14031403
; SSE2-NEXT: psrad $16, %xmm5
14041404
; SSE2-NEXT: packssdw %xmm6, %xmm5
14051405
; SSE2-NEXT: psubusw %xmm5, %xmm0
1406-
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1407-
; SSE2-NEXT: psrad $16, %xmm1
1408-
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1409-
; SSE2-NEXT: psrad $16, %xmm0
1410-
; SSE2-NEXT: packssdw %xmm1, %xmm0
14111406
; SSE2-NEXT: retq
14121407
;
14131408
; SSSE3-LABEL: psubus_8i32_max:
@@ -1738,111 +1733,91 @@ define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind {
17381733
; SSE2-LABEL: psubus_16i32_max:
17391734
; SSE2: # %bb.0: # %vector.ph
17401735
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648]
1741-
; SSE2-NEXT: movdqa %xmm3, %xmm8
1736+
; SSE2-NEXT: movdqa %xmm5, %xmm8
17421737
; SSE2-NEXT: pxor %xmm9, %xmm8
17431738
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147549183,2147549183,2147549183,2147549183]
17441739
; SSE2-NEXT: movdqa %xmm7, %xmm6
17451740
; SSE2-NEXT: pcmpgtd %xmm8, %xmm6
17461741
; SSE2-NEXT: pcmpeqd %xmm8, %xmm8
1747-
; SSE2-NEXT: pand %xmm6, %xmm3
1742+
; SSE2-NEXT: pand %xmm6, %xmm5
17481743
; SSE2-NEXT: pxor %xmm8, %xmm6
1749-
; SSE2-NEXT: por %xmm3, %xmm6
1744+
; SSE2-NEXT: por %xmm5, %xmm6
17501745
; SSE2-NEXT: pslld $16, %xmm6
17511746
; SSE2-NEXT: psrad $16, %xmm6
1752-
; SSE2-NEXT: movdqa %xmm2, %xmm10
1747+
; SSE2-NEXT: movdqa %xmm4, %xmm10
17531748
; SSE2-NEXT: pxor %xmm9, %xmm10
1754-
; SSE2-NEXT: movdqa %xmm7, %xmm3
1755-
; SSE2-NEXT: pcmpgtd %xmm10, %xmm3
1756-
; SSE2-NEXT: pand %xmm3, %xmm2
1757-
; SSE2-NEXT: pxor %xmm8, %xmm3
1758-
; SSE2-NEXT: por %xmm2, %xmm3
1759-
; SSE2-NEXT: pslld $16, %xmm3
1760-
; SSE2-NEXT: psrad $16, %xmm3
1761-
; SSE2-NEXT: packssdw %xmm6, %xmm3
1762-
; SSE2-NEXT: movdqa %xmm5, %xmm2
1763-
; SSE2-NEXT: pxor %xmm9, %xmm2
1749+
; SSE2-NEXT: movdqa %xmm7, %xmm5
1750+
; SSE2-NEXT: pcmpgtd %xmm10, %xmm5
1751+
; SSE2-NEXT: pand %xmm5, %xmm4
1752+
; SSE2-NEXT: pxor %xmm8, %xmm5
1753+
; SSE2-NEXT: por %xmm4, %xmm5
1754+
; SSE2-NEXT: pslld $16, %xmm5
1755+
; SSE2-NEXT: psrad $16, %xmm5
1756+
; SSE2-NEXT: packssdw %xmm6, %xmm5
1757+
; SSE2-NEXT: movdqa %xmm3, %xmm4
1758+
; SSE2-NEXT: pxor %xmm9, %xmm4
17641759
; SSE2-NEXT: movdqa %xmm7, %xmm6
1765-
; SSE2-NEXT: pcmpgtd %xmm2, %xmm6
1766-
; SSE2-NEXT: pand %xmm6, %xmm5
1760+
; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
1761+
; SSE2-NEXT: pand %xmm6, %xmm3
17671762
; SSE2-NEXT: pxor %xmm8, %xmm6
1768-
; SSE2-NEXT: por %xmm5, %xmm6
1763+
; SSE2-NEXT: por %xmm3, %xmm6
17691764
; SSE2-NEXT: pslld $16, %xmm6
17701765
; SSE2-NEXT: psrad $16, %xmm6
1771-
; SSE2-NEXT: pxor %xmm4, %xmm9
1766+
; SSE2-NEXT: pxor %xmm2, %xmm9
17721767
; SSE2-NEXT: pcmpgtd %xmm9, %xmm7
17731768
; SSE2-NEXT: pxor %xmm7, %xmm8
1774-
; SSE2-NEXT: pand %xmm4, %xmm7
1769+
; SSE2-NEXT: pand %xmm2, %xmm7
17751770
; SSE2-NEXT: por %xmm8, %xmm7
17761771
; SSE2-NEXT: pslld $16, %xmm7
17771772
; SSE2-NEXT: psrad $16, %xmm7
17781773
; SSE2-NEXT: packssdw %xmm6, %xmm7
1779-
; SSE2-NEXT: psubusw %xmm7, %xmm1
1780-
; SSE2-NEXT: psubusw %xmm3, %xmm0
1781-
; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
1782-
; SSE2-NEXT: psrad $16, %xmm2
1783-
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1784-
; SSE2-NEXT: psrad $16, %xmm0
1785-
; SSE2-NEXT: packssdw %xmm2, %xmm0
1786-
; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
1787-
; SSE2-NEXT: psrad $16, %xmm2
1788-
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
1789-
; SSE2-NEXT: psrad $16, %xmm1
1790-
; SSE2-NEXT: packssdw %xmm2, %xmm1
1774+
; SSE2-NEXT: psubusw %xmm7, %xmm0
1775+
; SSE2-NEXT: psubusw %xmm5, %xmm1
17911776
; SSE2-NEXT: retq
17921777
;
17931778
; SSSE3-LABEL: psubus_16i32_max:
17941779
; SSSE3: # %bb.0: # %vector.ph
17951780
; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648]
1796-
; SSSE3-NEXT: movdqa %xmm3, %xmm8
1781+
; SSSE3-NEXT: movdqa %xmm5, %xmm8
17971782
; SSSE3-NEXT: pxor %xmm9, %xmm8
17981783
; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147549183,2147549183,2147549183,2147549183]
17991784
; SSSE3-NEXT: movdqa %xmm7, %xmm6
18001785
; SSSE3-NEXT: pcmpgtd %xmm8, %xmm6
18011786
; SSSE3-NEXT: pcmpeqd %xmm8, %xmm8
1802-
; SSSE3-NEXT: pand %xmm6, %xmm3
1787+
; SSSE3-NEXT: pand %xmm6, %xmm5
18031788
; SSSE3-NEXT: pxor %xmm8, %xmm6
1804-
; SSSE3-NEXT: por %xmm3, %xmm6
1789+
; SSSE3-NEXT: por %xmm5, %xmm6
18051790
; SSSE3-NEXT: pslld $16, %xmm6
18061791
; SSSE3-NEXT: psrad $16, %xmm6
1807-
; SSSE3-NEXT: movdqa %xmm2, %xmm10
1792+
; SSSE3-NEXT: movdqa %xmm4, %xmm10
18081793
; SSSE3-NEXT: pxor %xmm9, %xmm10
1809-
; SSSE3-NEXT: movdqa %xmm7, %xmm3
1810-
; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3
1811-
; SSSE3-NEXT: pand %xmm3, %xmm2
1812-
; SSSE3-NEXT: pxor %xmm8, %xmm3
1813-
; SSSE3-NEXT: por %xmm2, %xmm3
1814-
; SSSE3-NEXT: pslld $16, %xmm3
1815-
; SSSE3-NEXT: psrad $16, %xmm3
1816-
; SSSE3-NEXT: packssdw %xmm6, %xmm3
1817-
; SSSE3-NEXT: movdqa %xmm5, %xmm2
1818-
; SSSE3-NEXT: pxor %xmm9, %xmm2
1794+
; SSSE3-NEXT: movdqa %xmm7, %xmm5
1795+
; SSSE3-NEXT: pcmpgtd %xmm10, %xmm5
1796+
; SSSE3-NEXT: pand %xmm5, %xmm4
1797+
; SSSE3-NEXT: pxor %xmm8, %xmm5
1798+
; SSSE3-NEXT: por %xmm4, %xmm5
1799+
; SSSE3-NEXT: pslld $16, %xmm5
1800+
; SSSE3-NEXT: psrad $16, %xmm5
1801+
; SSSE3-NEXT: packssdw %xmm6, %xmm5
1802+
; SSSE3-NEXT: movdqa %xmm3, %xmm4
1803+
; SSSE3-NEXT: pxor %xmm9, %xmm4
18191804
; SSSE3-NEXT: movdqa %xmm7, %xmm6
1820-
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6
1821-
; SSSE3-NEXT: pand %xmm6, %xmm5
1805+
; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
1806+
; SSSE3-NEXT: pand %xmm6, %xmm3
18221807
; SSSE3-NEXT: pxor %xmm8, %xmm6
1823-
; SSSE3-NEXT: por %xmm5, %xmm6
1808+
; SSSE3-NEXT: por %xmm3, %xmm6
18241809
; SSSE3-NEXT: pslld $16, %xmm6
18251810
; SSSE3-NEXT: psrad $16, %xmm6
1826-
; SSSE3-NEXT: pxor %xmm4, %xmm9
1811+
; SSSE3-NEXT: pxor %xmm2, %xmm9
18271812
; SSSE3-NEXT: pcmpgtd %xmm9, %xmm7
18281813
; SSSE3-NEXT: pxor %xmm7, %xmm8
1829-
; SSSE3-NEXT: pand %xmm4, %xmm7
1814+
; SSSE3-NEXT: pand %xmm2, %xmm7
18301815
; SSSE3-NEXT: por %xmm8, %xmm7
18311816
; SSSE3-NEXT: pslld $16, %xmm7
18321817
; SSSE3-NEXT: psrad $16, %xmm7
18331818
; SSSE3-NEXT: packssdw %xmm6, %xmm7
1834-
; SSSE3-NEXT: psubusw %xmm7, %xmm1
1835-
; SSSE3-NEXT: psubusw %xmm3, %xmm0
1836-
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
1837-
; SSSE3-NEXT: psrad $16, %xmm2
1838-
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1839-
; SSSE3-NEXT: psrad $16, %xmm0
1840-
; SSSE3-NEXT: packssdw %xmm2, %xmm0
1841-
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
1842-
; SSSE3-NEXT: psrad $16, %xmm2
1843-
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
1844-
; SSSE3-NEXT: psrad $16, %xmm1
1845-
; SSSE3-NEXT: packssdw %xmm2, %xmm1
1819+
; SSSE3-NEXT: psubusw %xmm7, %xmm0
1820+
; SSSE3-NEXT: psubusw %xmm5, %xmm1
18461821
; SSSE3-NEXT: retq
18471822
;
18481823
; SSE41-LABEL: psubus_16i32_max:
@@ -1923,11 +1898,6 @@ define <8 x i16> @psubus_i16_i32_max_swapped(<8 x i16> %x, <8 x i32> %y) nounwin
19231898
; SSE2-NEXT: psrad $16, %xmm5
19241899
; SSE2-NEXT: packssdw %xmm6, %xmm5
19251900
; SSE2-NEXT: psubusw %xmm5, %xmm0
1926-
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1927-
; SSE2-NEXT: psrad $16, %xmm1
1928-
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1929-
; SSE2-NEXT: psrad $16, %xmm0
1930-
; SSE2-NEXT: packssdw %xmm1, %xmm0
19311901
; SSE2-NEXT: retq
19321902
;
19331903
; SSSE3-LABEL: psubus_i16_i32_max_swapped:

0 commit comments

Comments
 (0)