Skip to content

Commit 7e44208

Browse files
committed
[X86][SSE] combineSubToSubus - add v16i32 handling on pre-AVX512BW targets.
v16i32 -> v16i16/v8i16 truncation is now good enough, using PACKSS/PACKUS + shuffle combining, that it's no longer necessary to early-out on pre-AVX512BW targets. This was noticed while looking at completing PR40111 and moving combineSubToSubus to DAGCombine entirely.
1 parent c4944a6 commit 7e44208

File tree

2 files changed

+114
-135
lines changed

2 files changed

+114
-135
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -48756,9 +48756,9 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
4875648756

4875748757
// PSUBUS is supported, starting from SSE2.
4875848758
EVT EltVT = VT.getVectorElementType();
48759-
if (!(Subtarget.hasSSE2() && (EltVT == MVT::i8 || EltVT == MVT::i16 ||
48760-
VT == MVT::v8i32 || VT == MVT::v8i64)) &&
48761-
!(Subtarget.useBWIRegs() && (VT == MVT::v16i32)))
48759+
if (!(Subtarget.hasSSE2() &&
48760+
(EltVT == MVT::i8 || EltVT == MVT::i16 || VT == MVT::v8i32 ||
48761+
VT == MVT::v8i64 || VT == MVT::v16i32)))
4876248762
return SDValue();
4876348763

4876448764
SDValue SubusLHS, SubusRHS;
@@ -48795,8 +48795,8 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
4879548795
SDValue MinRHS = Op1.getOperand(0).getOperand(1);
4879648796
EVT TruncVT = Op1.getOperand(0).getValueType();
4879748797
if (!(Subtarget.hasSSE2() &&
48798-
(TruncVT == MVT::v8i32 || TruncVT == MVT::v8i64)) &&
48799-
!(Subtarget.useBWIRegs() && (TruncVT == MVT::v16i32)))
48798+
(TruncVT == MVT::v8i32 || TruncVT == MVT::v8i64 ||
48799+
TruncVT == MVT::v16i32)))
4880048800
return SDValue();
4880148801
SDValue OpToSaturate;
4880248802
if (MinLHS.getOpcode() == ISD::ZERO_EXTEND &&

llvm/test/CodeGen/X86/psubus.ll

Lines changed: 109 additions & 130 deletions
Original file line numberDiff line numberDiff line change
@@ -1737,141 +1737,125 @@ vector.ph:
17371737
define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind {
17381738
; SSE2-LABEL: psubus_16i32_max:
17391739
; SSE2: # %bb.0: # %vector.ph
1740-
; SSE2-NEXT: movdqa %xmm1, %xmm8
1741-
; SSE2-NEXT: pxor %xmm7, %xmm7
1742-
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3]
1743-
; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
1744-
; SSE2-NEXT: movdqa %xmm0, %xmm10
1745-
; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3]
1746-
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
1747-
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
1748-
; SSE2-NEXT: movdqa %xmm3, %xmm6
1749-
; SSE2-NEXT: pxor %xmm7, %xmm6
1750-
; SSE2-NEXT: movdqa %xmm0, %xmm9
1751-
; SSE2-NEXT: por %xmm7, %xmm9
1752-
; SSE2-NEXT: pcmpgtd %xmm6, %xmm9
1753-
; SSE2-NEXT: pand %xmm9, %xmm0
1754-
; SSE2-NEXT: pandn %xmm3, %xmm9
1755-
; SSE2-NEXT: por %xmm0, %xmm9
1756-
; SSE2-NEXT: movdqa %xmm2, %xmm6
1757-
; SSE2-NEXT: pxor %xmm7, %xmm6
1758-
; SSE2-NEXT: movdqa %xmm10, %xmm0
1759-
; SSE2-NEXT: por %xmm7, %xmm0
1760-
; SSE2-NEXT: pcmpgtd %xmm6, %xmm0
1761-
; SSE2-NEXT: pand %xmm0, %xmm10
1762-
; SSE2-NEXT: pandn %xmm2, %xmm0
1763-
; SSE2-NEXT: por %xmm10, %xmm0
1764-
; SSE2-NEXT: movdqa %xmm5, %xmm10
1765-
; SSE2-NEXT: pxor %xmm7, %xmm10
1766-
; SSE2-NEXT: movdqa %xmm8, %xmm6
1767-
; SSE2-NEXT: por %xmm7, %xmm6
1768-
; SSE2-NEXT: pcmpgtd %xmm10, %xmm6
1769-
; SSE2-NEXT: pand %xmm6, %xmm8
1770-
; SSE2-NEXT: pandn %xmm5, %xmm6
1771-
; SSE2-NEXT: por %xmm8, %xmm6
1772-
; SSE2-NEXT: movdqa %xmm4, %xmm8
1773-
; SSE2-NEXT: pxor %xmm7, %xmm8
1774-
; SSE2-NEXT: por %xmm1, %xmm7
1775-
; SSE2-NEXT: pcmpgtd %xmm8, %xmm7
1776-
; SSE2-NEXT: pand %xmm7, %xmm1
1777-
; SSE2-NEXT: pandn %xmm4, %xmm7
1778-
; SSE2-NEXT: por %xmm7, %xmm1
1779-
; SSE2-NEXT: psubd %xmm4, %xmm1
1780-
; SSE2-NEXT: psubd %xmm5, %xmm6
1781-
; SSE2-NEXT: psubd %xmm2, %xmm0
1782-
; SSE2-NEXT: psubd %xmm3, %xmm9
1783-
; SSE2-NEXT: pslld $16, %xmm9
1784-
; SSE2-NEXT: psrad $16, %xmm9
1785-
; SSE2-NEXT: pslld $16, %xmm0
1786-
; SSE2-NEXT: psrad $16, %xmm0
1787-
; SSE2-NEXT: packssdw %xmm9, %xmm0
1740+
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648]
1741+
; SSE2-NEXT: movdqa %xmm3, %xmm8
1742+
; SSE2-NEXT: pxor %xmm9, %xmm8
1743+
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147549183,2147549183,2147549183,2147549183]
1744+
; SSE2-NEXT: movdqa %xmm7, %xmm6
1745+
; SSE2-NEXT: pcmpgtd %xmm8, %xmm6
1746+
; SSE2-NEXT: pcmpeqd %xmm8, %xmm8
1747+
; SSE2-NEXT: pand %xmm6, %xmm3
1748+
; SSE2-NEXT: pxor %xmm8, %xmm6
1749+
; SSE2-NEXT: por %xmm3, %xmm6
17881750
; SSE2-NEXT: pslld $16, %xmm6
17891751
; SSE2-NEXT: psrad $16, %xmm6
1790-
; SSE2-NEXT: pslld $16, %xmm1
1752+
; SSE2-NEXT: movdqa %xmm2, %xmm10
1753+
; SSE2-NEXT: pxor %xmm9, %xmm10
1754+
; SSE2-NEXT: movdqa %xmm7, %xmm3
1755+
; SSE2-NEXT: pcmpgtd %xmm10, %xmm3
1756+
; SSE2-NEXT: pand %xmm3, %xmm2
1757+
; SSE2-NEXT: pxor %xmm8, %xmm3
1758+
; SSE2-NEXT: por %xmm2, %xmm3
1759+
; SSE2-NEXT: pslld $16, %xmm3
1760+
; SSE2-NEXT: psrad $16, %xmm3
1761+
; SSE2-NEXT: packssdw %xmm6, %xmm3
1762+
; SSE2-NEXT: movdqa %xmm5, %xmm2
1763+
; SSE2-NEXT: pxor %xmm9, %xmm2
1764+
; SSE2-NEXT: movdqa %xmm7, %xmm6
1765+
; SSE2-NEXT: pcmpgtd %xmm2, %xmm6
1766+
; SSE2-NEXT: pand %xmm6, %xmm5
1767+
; SSE2-NEXT: pxor %xmm8, %xmm6
1768+
; SSE2-NEXT: por %xmm5, %xmm6
1769+
; SSE2-NEXT: pslld $16, %xmm6
1770+
; SSE2-NEXT: psrad $16, %xmm6
1771+
; SSE2-NEXT: pxor %xmm4, %xmm9
1772+
; SSE2-NEXT: pcmpgtd %xmm9, %xmm7
1773+
; SSE2-NEXT: pxor %xmm7, %xmm8
1774+
; SSE2-NEXT: pand %xmm4, %xmm7
1775+
; SSE2-NEXT: por %xmm8, %xmm7
1776+
; SSE2-NEXT: pslld $16, %xmm7
1777+
; SSE2-NEXT: psrad $16, %xmm7
1778+
; SSE2-NEXT: packssdw %xmm6, %xmm7
1779+
; SSE2-NEXT: psubusw %xmm7, %xmm1
1780+
; SSE2-NEXT: psubusw %xmm3, %xmm0
1781+
; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
1782+
; SSE2-NEXT: psrad $16, %xmm2
1783+
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1784+
; SSE2-NEXT: psrad $16, %xmm0
1785+
; SSE2-NEXT: packssdw %xmm2, %xmm0
1786+
; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
1787+
; SSE2-NEXT: psrad $16, %xmm2
1788+
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
17911789
; SSE2-NEXT: psrad $16, %xmm1
1792-
; SSE2-NEXT: packssdw %xmm6, %xmm1
1790+
; SSE2-NEXT: packssdw %xmm2, %xmm1
17931791
; SSE2-NEXT: retq
17941792
;
17951793
; SSSE3-LABEL: psubus_16i32_max:
17961794
; SSSE3: # %bb.0: # %vector.ph
1797-
; SSSE3-NEXT: movdqa %xmm1, %xmm8
1798-
; SSSE3-NEXT: pxor %xmm7, %xmm7
1799-
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3]
1800-
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
1801-
; SSSE3-NEXT: movdqa %xmm0, %xmm10
1802-
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3]
1803-
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
1804-
; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
1805-
; SSSE3-NEXT: movdqa %xmm3, %xmm6
1806-
; SSSE3-NEXT: pxor %xmm7, %xmm6
1807-
; SSSE3-NEXT: movdqa %xmm0, %xmm9
1808-
; SSSE3-NEXT: por %xmm7, %xmm9
1809-
; SSSE3-NEXT: pcmpgtd %xmm6, %xmm9
1810-
; SSSE3-NEXT: pand %xmm9, %xmm0
1811-
; SSSE3-NEXT: pandn %xmm3, %xmm9
1812-
; SSSE3-NEXT: por %xmm0, %xmm9
1813-
; SSSE3-NEXT: movdqa %xmm2, %xmm6
1814-
; SSSE3-NEXT: pxor %xmm7, %xmm6
1815-
; SSSE3-NEXT: movdqa %xmm10, %xmm0
1816-
; SSSE3-NEXT: por %xmm7, %xmm0
1817-
; SSSE3-NEXT: pcmpgtd %xmm6, %xmm0
1818-
; SSSE3-NEXT: pand %xmm0, %xmm10
1819-
; SSSE3-NEXT: pandn %xmm2, %xmm0
1820-
; SSSE3-NEXT: por %xmm10, %xmm0
1821-
; SSSE3-NEXT: movdqa %xmm5, %xmm10
1822-
; SSSE3-NEXT: pxor %xmm7, %xmm10
1823-
; SSSE3-NEXT: movdqa %xmm8, %xmm6
1824-
; SSSE3-NEXT: por %xmm7, %xmm6
1825-
; SSSE3-NEXT: pcmpgtd %xmm10, %xmm6
1826-
; SSSE3-NEXT: pand %xmm6, %xmm8
1827-
; SSSE3-NEXT: pandn %xmm5, %xmm6
1828-
; SSSE3-NEXT: por %xmm8, %xmm6
1829-
; SSSE3-NEXT: movdqa %xmm4, %xmm8
1830-
; SSSE3-NEXT: pxor %xmm7, %xmm8
1831-
; SSSE3-NEXT: por %xmm1, %xmm7
1832-
; SSSE3-NEXT: pcmpgtd %xmm8, %xmm7
1833-
; SSSE3-NEXT: pand %xmm7, %xmm1
1834-
; SSSE3-NEXT: pandn %xmm4, %xmm7
1835-
; SSSE3-NEXT: por %xmm7, %xmm1
1836-
; SSSE3-NEXT: psubd %xmm4, %xmm1
1837-
; SSSE3-NEXT: psubd %xmm5, %xmm6
1838-
; SSSE3-NEXT: psubd %xmm2, %xmm0
1839-
; SSSE3-NEXT: psubd %xmm3, %xmm9
1840-
; SSSE3-NEXT: pslld $16, %xmm9
1841-
; SSSE3-NEXT: psrad $16, %xmm9
1842-
; SSSE3-NEXT: pslld $16, %xmm0
1843-
; SSSE3-NEXT: psrad $16, %xmm0
1844-
; SSSE3-NEXT: packssdw %xmm9, %xmm0
1795+
; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648]
1796+
; SSSE3-NEXT: movdqa %xmm3, %xmm8
1797+
; SSSE3-NEXT: pxor %xmm9, %xmm8
1798+
; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147549183,2147549183,2147549183,2147549183]
1799+
; SSSE3-NEXT: movdqa %xmm7, %xmm6
1800+
; SSSE3-NEXT: pcmpgtd %xmm8, %xmm6
1801+
; SSSE3-NEXT: pcmpeqd %xmm8, %xmm8
1802+
; SSSE3-NEXT: pand %xmm6, %xmm3
1803+
; SSSE3-NEXT: pxor %xmm8, %xmm6
1804+
; SSSE3-NEXT: por %xmm3, %xmm6
1805+
; SSSE3-NEXT: pslld $16, %xmm6
1806+
; SSSE3-NEXT: psrad $16, %xmm6
1807+
; SSSE3-NEXT: movdqa %xmm2, %xmm10
1808+
; SSSE3-NEXT: pxor %xmm9, %xmm10
1809+
; SSSE3-NEXT: movdqa %xmm7, %xmm3
1810+
; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3
1811+
; SSSE3-NEXT: pand %xmm3, %xmm2
1812+
; SSSE3-NEXT: pxor %xmm8, %xmm3
1813+
; SSSE3-NEXT: por %xmm2, %xmm3
1814+
; SSSE3-NEXT: pslld $16, %xmm3
1815+
; SSSE3-NEXT: psrad $16, %xmm3
1816+
; SSSE3-NEXT: packssdw %xmm6, %xmm3
1817+
; SSSE3-NEXT: movdqa %xmm5, %xmm2
1818+
; SSSE3-NEXT: pxor %xmm9, %xmm2
1819+
; SSSE3-NEXT: movdqa %xmm7, %xmm6
1820+
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6
1821+
; SSSE3-NEXT: pand %xmm6, %xmm5
1822+
; SSSE3-NEXT: pxor %xmm8, %xmm6
1823+
; SSSE3-NEXT: por %xmm5, %xmm6
18451824
; SSSE3-NEXT: pslld $16, %xmm6
18461825
; SSSE3-NEXT: psrad $16, %xmm6
1847-
; SSSE3-NEXT: pslld $16, %xmm1
1826+
; SSSE3-NEXT: pxor %xmm4, %xmm9
1827+
; SSSE3-NEXT: pcmpgtd %xmm9, %xmm7
1828+
; SSSE3-NEXT: pxor %xmm7, %xmm8
1829+
; SSSE3-NEXT: pand %xmm4, %xmm7
1830+
; SSSE3-NEXT: por %xmm8, %xmm7
1831+
; SSSE3-NEXT: pslld $16, %xmm7
1832+
; SSSE3-NEXT: psrad $16, %xmm7
1833+
; SSSE3-NEXT: packssdw %xmm6, %xmm7
1834+
; SSSE3-NEXT: psubusw %xmm7, %xmm1
1835+
; SSSE3-NEXT: psubusw %xmm3, %xmm0
1836+
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
1837+
; SSSE3-NEXT: psrad $16, %xmm2
1838+
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1839+
; SSSE3-NEXT: psrad $16, %xmm0
1840+
; SSSE3-NEXT: packssdw %xmm2, %xmm0
1841+
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
1842+
; SSSE3-NEXT: psrad $16, %xmm2
1843+
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
18481844
; SSSE3-NEXT: psrad $16, %xmm1
1849-
; SSSE3-NEXT: packssdw %xmm6, %xmm1
1845+
; SSSE3-NEXT: packssdw %xmm2, %xmm1
18501846
; SSSE3-NEXT: retq
18511847
;
18521848
; SSE41-LABEL: psubus_16i32_max:
18531849
; SSE41: # %bb.0: # %vector.ph
1854-
; SSE41-NEXT: pxor %xmm8, %xmm8
1855-
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
1856-
; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
1857-
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1858-
; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
1859-
; SSE41-NEXT: pmaxud %xmm3, %xmm0
1860-
; SSE41-NEXT: pmaxud %xmm2, %xmm7
1861-
; SSE41-NEXT: pmaxud %xmm5, %xmm1
1862-
; SSE41-NEXT: pmaxud %xmm4, %xmm6
1863-
; SSE41-NEXT: psubd %xmm4, %xmm6
1864-
; SSE41-NEXT: psubd %xmm5, %xmm1
1865-
; SSE41-NEXT: psubd %xmm2, %xmm7
1866-
; SSE41-NEXT: psubd %xmm3, %xmm0
1867-
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2],xmm8[3],xmm0[4],xmm8[5],xmm0[6],xmm8[7]
1868-
; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2],xmm8[3],xmm7[4],xmm8[5],xmm7[6],xmm8[7]
1869-
; SSE41-NEXT: packusdw %xmm0, %xmm7
1870-
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm8[1],xmm1[2],xmm8[3],xmm1[4],xmm8[5],xmm1[6],xmm8[7]
1871-
; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2],xmm8[3],xmm6[4],xmm8[5],xmm6[6],xmm8[7]
1872-
; SSE41-NEXT: packusdw %xmm1, %xmm6
1873-
; SSE41-NEXT: movdqa %xmm7, %xmm0
1874-
; SSE41-NEXT: movdqa %xmm6, %xmm1
1850+
; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535]
1851+
; SSE41-NEXT: pminud %xmm6, %xmm5
1852+
; SSE41-NEXT: pminud %xmm6, %xmm4
1853+
; SSE41-NEXT: packusdw %xmm5, %xmm4
1854+
; SSE41-NEXT: pminud %xmm6, %xmm3
1855+
; SSE41-NEXT: pminud %xmm6, %xmm2
1856+
; SSE41-NEXT: packusdw %xmm3, %xmm2
1857+
; SSE41-NEXT: psubusw %xmm2, %xmm0
1858+
; SSE41-NEXT: psubusw %xmm4, %xmm1
18751859
; SSE41-NEXT: retq
18761860
;
18771861
; AVX1-LABEL: psubus_16i32_max:
@@ -1894,16 +1878,11 @@ define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind {
18941878
; AVX2-LABEL: psubus_16i32_max:
18951879
; AVX2: # %bb.0: # %vector.ph
18961880
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535]
1897-
; AVX2-NEXT: vpminud %ymm3, %ymm1, %ymm1
1898-
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
1899-
; AVX2-NEXT: vpackusdw %xmm4, %xmm1, %xmm1
19001881
; AVX2-NEXT: vpminud %ymm3, %ymm2, %ymm2
1901-
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
1902-
; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1903-
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
1904-
; AVX2-NEXT: vpsubusw %xmm2, %xmm3, %xmm2
1905-
; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1906-
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
1882+
; AVX2-NEXT: vpminud %ymm3, %ymm1, %ymm1
1883+
; AVX2-NEXT: vpackusdw %ymm2, %ymm1, %ymm1
1884+
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
1885+
; AVX2-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
19071886
; AVX2-NEXT: retq
19081887
;
19091888
; AVX512-LABEL: psubus_16i32_max:

0 commit comments

Comments
 (0)