
Commit b51dc64

[X86] Add DAG combine to fold any_extend_vector_inreg+truncstore to an extractelement+store
We have custom code that ignores the normal promoting type legalization on vector types narrower than 128 bits, like v4i8, in order to emit pavgb, paddusb, and psubusb, since we don't have equivalent instructions on a larger element type like v4i32. If such an operation appears before a store, we can be left with an any_extend_vector_inreg followed by a truncstore after type legalization. When truncstore isn't legal, this is normally decomposed into shuffles and a non-truncating store, which then combines away the any_extend_vector_inreg and the shuffle, leaving just the store. On AVX512, truncstore is legal, so we don't decompose it, and we had no combine to clean it up.

This patch adds a new DAG combine to detect this case and emit either an extract_store for 64-bit stores, or an extractelement+store for 32- and 16-bit stores. This makes the AVX512 codegen match the AVX2 codegen in these situations.

I'm restricting this to only when -x86-experimental-vector-widening-legalization is false. When we're widening, we're unlikely to create this any_extend_vector_inreg+truncstore combination, so we should be able to remove this code when we flip the default. I would like to flip the default soon, but I need to investigate some performance regressions it's causing in our branch that I wasn't seeing on trunk.

Differential Revision: https://reviews.llvm.org/D65538

llvm-svn: 367488
1 parent c724215 commit b51dc64
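For illustration, the kind of code this combine targets looks like the addus_v8i8 test updated below: an unsigned saturating add on a narrow <8 x i8> vector whose result is stored straight back to memory. The sketch below is not part of the commit (the function name and the exact compare/select saturation idiom are illustrative); without widening legalization, the custom paddusb lowering of such a function leaves an any_extend_vector_inreg feeding a truncating store, which this combine now turns into a plain 64-bit vmovq store on AVX512, matching AVX2.

define void @addus_v8i8_sketch(<8 x i8>* %p1, <8 x i8>* %p2) {
  %ld1 = load <8 x i8>, <8 x i8>* %p1, align 8
  %ld2 = load <8 x i8>, <8 x i8>* %p2, align 8
  ; unsigned saturating add: on wrap-around, clamp the lane to 255
  %sum = add <8 x i8> %ld2, %ld1
  %wrap = icmp ugt <8 x i8> %ld2, %sum
  %res = select <8 x i1> %wrap, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> %sum
  ; this <8 x i8> store becomes a truncating store after promoting type legalization
  store <8 x i8> %res, <8 x i8>* %p1, align 8
  ret void
}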

File tree

9 files changed (+271, -817 lines)


llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 35 additions & 0 deletions
@@ -40179,6 +40179,41 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
                               MVT::v16i8, St->getMemOperand());
   }
 
+  // Look for a truncating store to a less than 128 bit vector that has been
+  // truncated from an any_extend_inreg from a 128 bit vector with the same
+  // element size. We can use a 64/32/16-bit extractelement and store that.
+  // Disabling this when widening legalization is in effect since the trunc
+  // store would have been unlikely to be created in that case. Only doing this
+  // when truncstore is legal since it would otherwise be decomposed below and
+  // then combined away.
+  if (St->isTruncatingStore() && TLI.isTruncStoreLegal(VT, StVT) &&
+      StoredVal.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG &&
+      StoredVal.getValueType().is128BitVector() &&
+      !ExperimentalVectorWideningLegalization) {
+    EVT OrigVT = StoredVal.getOperand(0).getValueType();
+    if (OrigVT.is128BitVector() &&
+        OrigVT.getVectorElementType() == StVT.getVectorElementType()) {
+      unsigned StoreSize = StVT.getSizeInBits();
+      assert((128 % StoreSize == 0) && "Unexpected store size!");
+      MVT IntVT = MVT::getIntegerVT(StoreSize);
+      MVT CastVT = MVT::getVectorVT(IntVT, 128 / StoreSize);
+      StoredVal = DAG.getBitcast(CastVT, StoredVal.getOperand(0));
+      // Use extract_store for the 64-bit case to support 32-bit targets.
+      if (IntVT == MVT::i64) {
+        SDVTList Tys = DAG.getVTList(MVT::Other);
+        SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
+        return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
+                                       IntVT, St->getMemOperand());
+      }
+
+      // Otherwise just use an extract and store.
+      StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, IntVT, StoredVal,
+                              DAG.getIntPtrConstant(0, dl));
+      return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
+                          St->getMemOperand());
+    }
+  }
+
   // Optimize trunc store (of multiple scalars) to shuffle and store.
   // First, pack all of the elements in one place. Next, store to memory
   // in fewer chunks.

llvm/test/CodeGen/X86/f16c-intrinsics.ll

Lines changed: 2 additions & 6 deletions
@@ -269,17 +269,13 @@ define void @test_x86_vcvtps2ph_128_m(<4 x i16>* nocapture %d, <4 x float> %a) n
 ; X32-AVX512VL: # %bb.0: # %entry
 ; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X32-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x03]
-; X32-AVX512VL-NEXT: vpmovzxwd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x33,0xc0]
-; X32-AVX512VL-NEXT: # xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X32-AVX512VL-NEXT: vpmovdw %xmm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x08,0x33,0x00]
+; X32-AVX512VL-NEXT: vmovlps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x13,0x00]
 ; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m:
 ; X64-AVX512VL: # %bb.0: # %entry
 ; X64-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x03]
-; X64-AVX512VL-NEXT: vpmovzxwd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x33,0xc0]
-; X64-AVX512VL-NEXT: # xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X64-AVX512VL-NEXT: vpmovdw %xmm0, (%rdi) # encoding: [0x62,0xf2,0x7e,0x08,0x33,0x07]
+; X64-AVX512VL-NEXT: vmovlps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x13,0x07]
 ; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
 entry:
   %0 = tail call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a, i32 3)

llvm/test/CodeGen/X86/paddus.ll

Lines changed: 37 additions & 126 deletions
@@ -1527,30 +1527,13 @@ define void @addus_v8i8(<8 x i8>* %p1, <8 x i8>* %p2) {
 ; SSE-NEXT: movq %xmm1, (%rdi)
 ; SSE-NEXT: retq
 ;
-; AVX1-LABEL: addus_v8i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX1-NEXT: vpaddusb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vmovq %xmm0, (%rdi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: addus_v8i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT: vpaddusb %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovq %xmm0, (%rdi)
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: addus_v8i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512-NEXT: vpaddusb %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT: vpmovwb %xmm0, (%rdi)
-; AVX512-NEXT: retq
+; AVX-LABEL: addus_v8i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vpaddusb %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmovq %xmm0, (%rdi)
+; AVX-NEXT: retq
   %ld1 = load <8 x i8>, <8 x i8>* %p1, align 8
   %ld2 = load <8 x i8>, <8 x i8>* %p2, align 8
   %1 = add <8 x i8> %ld2, %ld1
@@ -1569,30 +1552,13 @@ define void @addus_v4i8(<4 x i8>* %p1, <4 x i8>* %p2) {
 ; SSE-NEXT: movd %xmm1, (%rdi)
 ; SSE-NEXT: retq
 ;
-; AVX1-LABEL: addus_v4i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX1-NEXT: vpaddusb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vmovd %xmm0, (%rdi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: addus_v4i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX2-NEXT: vpaddusb %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovd %xmm0, (%rdi)
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: addus_v4i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX512-NEXT: vpaddusb %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX512-NEXT: vpmovdb %xmm0, (%rdi)
-; AVX512-NEXT: retq
+; AVX-LABEL: addus_v4i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vpaddusb %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmovd %xmm0, (%rdi)
+; AVX-NEXT: retq
   %ld1 = load <4 x i8>, <4 x i8>* %p1, align 4
   %ld2 = load <4 x i8>, <4 x i8>* %p2, align 4
   %1 = add <4 x i8> %ld2, %ld1
@@ -1635,36 +1601,15 @@ define void @addus_v2i8(<2 x i8>* %p1, <2 x i8>* %p2) {
 ; SSE41-NEXT: pextrw $0, %xmm1, (%rdi)
 ; SSE41-NEXT: retq
 ;
-; AVX1-LABEL: addus_v2i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: movzwl (%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: movzwl (%rsi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: vpaddusb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: addus_v2i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: movzwl (%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: movzwl (%rsi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vpaddusb %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi)
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: addus_v2i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: movzwl (%rdi), %eax
-; AVX512-NEXT: vmovd %eax, %xmm0
-; AVX512-NEXT: movzwl (%rsi), %eax
-; AVX512-NEXT: vmovd %eax, %xmm1
-; AVX512-NEXT: vpaddusb %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpmovqb %xmm0, (%rdi)
-; AVX512-NEXT: retq
+; AVX-LABEL: addus_v2i8:
+; AVX: # %bb.0:
+; AVX-NEXT: movzwl (%rdi), %eax
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: movzwl (%rsi), %eax
+; AVX-NEXT: vmovd %eax, %xmm1
+; AVX-NEXT: vpaddusb %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpextrw $0, %xmm0, (%rdi)
+; AVX-NEXT: retq
   %ld1 = load <2 x i8>, <2 x i8>* %p1, align 2
   %ld2 = load <2 x i8>, <2 x i8>* %p2, align 2
   %1 = add <2 x i8> %ld2, %ld1
@@ -1683,30 +1628,13 @@ define void @addus_v4i16(<4 x i16>* %p1, <4 x i16>* %p2) {
 ; SSE-NEXT: movq %xmm1, (%rdi)
 ; SSE-NEXT: retq
 ;
-; AVX1-LABEL: addus_v4i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX1-NEXT: vpaddusw %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vmovq %xmm0, (%rdi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: addus_v4i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT: vpaddusw %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovq %xmm0, (%rdi)
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: addus_v4i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512-NEXT: vpaddusw %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512-NEXT: vpmovdw %xmm0, (%rdi)
-; AVX512-NEXT: retq
+; AVX-LABEL: addus_v4i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vpaddusw %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmovq %xmm0, (%rdi)
+; AVX-NEXT: retq
   %ld1 = load <4 x i16>, <4 x i16>* %p1, align 4
   %ld2 = load <4 x i16>, <4 x i16>* %p2, align 4
   %1 = add <4 x i16> %ld2, %ld1
@@ -1725,30 +1653,13 @@ define void @addus_v2i16(<2 x i16>* %p1, <2 x i16>* %p2) {
 ; SSE-NEXT: movd %xmm1, (%rdi)
 ; SSE-NEXT: retq
 ;
-; AVX1-LABEL: addus_v2i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX1-NEXT: vpaddusw %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vmovd %xmm0, (%rdi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: addus_v2i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX2-NEXT: vpaddusw %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovd %xmm0, (%rdi)
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: addus_v2i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX512-NEXT: vpaddusw %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512-NEXT: vpmovqw %xmm0, (%rdi)
-; AVX512-NEXT: retq
+; AVX-LABEL: addus_v2i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vpaddusw %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmovd %xmm0, (%rdi)
+; AVX-NEXT: retq
   %ld1 = load <2 x i16>, <2 x i16>* %p1, align 2
   %ld2 = load <2 x i16>, <2 x i16>* %p2, align 2
   %1 = add <2 x i16> %ld2, %ld1
