Skip to content

Commit 4ca81a9

Browse files
committed
[X86] Add a DAG combine to replace vector loads feeding a v4i32->v2f64 CVTSI2FP/CVTUI2FP node with a vzload.
This is done only when the load isn't volatile. It improves load folding during isel, where we only have vzload and scalar_to_vector+load patterns; we can't add isel patterns for a full vector load because of the same volatile load issue. Also add some missing masked cvtsi2fp/cvtui2fp patterns that use vzload. llvm-svn: 364728
1 parent fc233c9 commit 4ca81a9

File tree

4 files changed

+56
-20
lines changed

4 files changed

+56
-20
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41101,6 +41101,34 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
4110141101
KnownZero, DCI))
4110241102
return SDValue(N, 0);
4110341103

41104+
// Convert a full vector load into vzload when not all bits are needed.
41105+
SDValue In = N->getOperand(0);
41106+
MVT InVT = In.getSimpleValueType();
41107+
if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
41108+
ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
41109+
assert(InVT.is128BitVector() && "Expected 128-bit input vector");
41110+
LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
41111+
// Unless the load is volatile.
41112+
if (!LN->isVolatile()) {
41113+
SDLoc dl(N);
41114+
unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
41115+
MVT MemVT = MVT::getIntegerVT(NumBits);
41116+
MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
41117+
SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
41118+
SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41119+
SDValue VZLoad =
41120+
DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT,
41121+
LN->getPointerInfo(),
41122+
LN->getAlignment(),
41123+
LN->getMemOperand()->getFlags());
41124+
SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
41125+
DAG.getBitcast(InVT, VZLoad));
41126+
DCI.CombineTo(N, Convert);
41127+
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
41128+
return SDValue(N, 0);
41129+
}
41130+
}
41131+
4110441132
return SDValue();
4110541133
}
4110641134

llvm/lib/Target/X86/X86InstrAVX512.td

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8429,9 +8429,25 @@ def : Pat<(v2f64 (X86VUintToFP (v4i32 VR128X:$src1))),
84298429
let Predicates = [HasVLX] in {
84308430
def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
84318431
(VCVTDQ2PDZ128rm addr:$src)>;
8432+
def : Pat<(v2f64 (vselect VK2WM:$mask,
8433+
(X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src)))),
8434+
VR128X:$src0)),
8435+
(VCVTDQ2PDZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
8436+
def : Pat<(v2f64 (vselect VK2WM:$mask,
8437+
(X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src)))),
8438+
v2f64x_info.ImmAllZerosV)),
8439+
(VCVTDQ2PDZ128rmkz VK2WM:$mask, addr:$src)>;
84328440

84338441
def : Pat<(v2f64 (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
84348442
(VCVTUDQ2PDZ128rm addr:$src)>;
8443+
def : Pat<(v2f64 (vselect VK2WM:$mask,
8444+
(X86VUintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src)))),
8445+
VR128X:$src0)),
8446+
(VCVTUDQ2PDZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
8447+
def : Pat<(v2f64 (vselect VK2WM:$mask,
8448+
(X86VUintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src)))),
8449+
v2f64x_info.ImmAllZerosV)),
8450+
(VCVTUDQ2PDZ128rmkz VK2WM:$mask, addr:$src)>;
84358451
}
84368452

84378453
let Predicates = [HasDQI, HasVLX] in {

llvm/test/CodeGen/X86/vec_int_to_fp-widen.ll

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3122,14 +3122,12 @@ define <2 x double> @sitofp_volatile_load_4i32_to_2f64(<4 x i32> *%a) {
31223122
define <2 x double> @sitofp_load_4i32_to_2f64_2(<4 x i32>* %x) {
31233123
; SSE-LABEL: sitofp_load_4i32_to_2f64_2:
31243124
; SSE: # %bb.0:
3125-
; SSE-NEXT: movaps (%rdi), %xmm0
3126-
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
3125+
; SSE-NEXT: cvtdq2pd (%rdi), %xmm0
31273126
; SSE-NEXT: retq
31283127
;
31293128
; AVX-LABEL: sitofp_load_4i32_to_2f64_2:
31303129
; AVX: # %bb.0:
3131-
; AVX-NEXT: vmovaps (%rdi), %xmm0
3132-
; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
3130+
; AVX-NEXT: vcvtdq2pd (%rdi), %xmm0
31333131
; AVX-NEXT: retq
31343132
%a = load <4 x i32>, <4 x i32>* %x
31353133
%b = sitofp <4 x i32> %a to <4 x double>
@@ -3597,30 +3595,28 @@ define <2 x double> @uitofp_load_4i32_to_2f64_2(<4 x i32>* %x) {
35973595
;
35983596
; AVX512F-LABEL: uitofp_load_4i32_to_2f64_2:
35993597
; AVX512F: # %bb.0:
3600-
; AVX512F-NEXT: vmovaps (%rdi), %xmm0
3598+
; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
36013599
; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0
36023600
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
36033601
; AVX512F-NEXT: vzeroupper
36043602
; AVX512F-NEXT: retq
36053603
;
36063604
; AVX512VL-LABEL: uitofp_load_4i32_to_2f64_2:
36073605
; AVX512VL: # %bb.0:
3608-
; AVX512VL-NEXT: vmovaps (%rdi), %xmm0
3609-
; AVX512VL-NEXT: vcvtudq2pd %xmm0, %xmm0
3606+
; AVX512VL-NEXT: vcvtudq2pd (%rdi), %xmm0
36103607
; AVX512VL-NEXT: retq
36113608
;
36123609
; AVX512DQ-LABEL: uitofp_load_4i32_to_2f64_2:
36133610
; AVX512DQ: # %bb.0:
3614-
; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
3611+
; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
36153612
; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0
36163613
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
36173614
; AVX512DQ-NEXT: vzeroupper
36183615
; AVX512DQ-NEXT: retq
36193616
;
36203617
; AVX512VLDQ-LABEL: uitofp_load_4i32_to_2f64_2:
36213618
; AVX512VLDQ: # %bb.0:
3622-
; AVX512VLDQ-NEXT: vmovaps (%rdi), %xmm0
3623-
; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %xmm0
3619+
; AVX512VLDQ-NEXT: vcvtudq2pd (%rdi), %xmm0
36243620
; AVX512VLDQ-NEXT: retq
36253621
%a = load <4 x i32>, <4 x i32>* %x
36263622
%b = uitofp <4 x i32> %a to <4 x double>

llvm/test/CodeGen/X86/vec_int_to_fp.ll

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3122,14 +3122,12 @@ define <2 x double> @sitofp_volatile_load_4i32_to_2f64(<4 x i32> *%a) {
31223122
define <2 x double> @sitofp_load_4i32_to_2f64_2(<4 x i32>* %x) {
31233123
; SSE-LABEL: sitofp_load_4i32_to_2f64_2:
31243124
; SSE: # %bb.0:
3125-
; SSE-NEXT: movaps (%rdi), %xmm0
3126-
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
3125+
; SSE-NEXT: cvtdq2pd (%rdi), %xmm0
31273126
; SSE-NEXT: retq
31283127
;
31293128
; AVX-LABEL: sitofp_load_4i32_to_2f64_2:
31303129
; AVX: # %bb.0:
3131-
; AVX-NEXT: vmovaps (%rdi), %xmm0
3132-
; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
3130+
; AVX-NEXT: vcvtdq2pd (%rdi), %xmm0
31333131
; AVX-NEXT: retq
31343132
%a = load <4 x i32>, <4 x i32>* %x
31353133
%b = sitofp <4 x i32> %a to <4 x double>
@@ -3595,30 +3593,28 @@ define <2 x double> @uitofp_load_4i32_to_2f64_2(<4 x i32>* %x) {
35953593
;
35963594
; AVX512F-LABEL: uitofp_load_4i32_to_2f64_2:
35973595
; AVX512F: # %bb.0:
3598-
; AVX512F-NEXT: vmovaps (%rdi), %xmm0
3596+
; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
35993597
; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0
36003598
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
36013599
; AVX512F-NEXT: vzeroupper
36023600
; AVX512F-NEXT: retq
36033601
;
36043602
; AVX512VL-LABEL: uitofp_load_4i32_to_2f64_2:
36053603
; AVX512VL: # %bb.0:
3606-
; AVX512VL-NEXT: vmovaps (%rdi), %xmm0
3607-
; AVX512VL-NEXT: vcvtudq2pd %xmm0, %xmm0
3604+
; AVX512VL-NEXT: vcvtudq2pd (%rdi), %xmm0
36083605
; AVX512VL-NEXT: retq
36093606
;
36103607
; AVX512DQ-LABEL: uitofp_load_4i32_to_2f64_2:
36113608
; AVX512DQ: # %bb.0:
3612-
; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
3609+
; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
36133610
; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0
36143611
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
36153612
; AVX512DQ-NEXT: vzeroupper
36163613
; AVX512DQ-NEXT: retq
36173614
;
36183615
; AVX512VLDQ-LABEL: uitofp_load_4i32_to_2f64_2:
36193616
; AVX512VLDQ: # %bb.0:
3620-
; AVX512VLDQ-NEXT: vmovaps (%rdi), %xmm0
3621-
; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %xmm0
3617+
; AVX512VLDQ-NEXT: vcvtudq2pd (%rdi), %xmm0
36223618
; AVX512VLDQ-NEXT: retq
36233619
%a = load <4 x i32>, <4 x i32>* %x
36243620
%b = uitofp <4 x i32> %a to <4 x double>

0 commit comments

Comments
 (0)