Skip to content

Commit b2ef948

Browse files
committed
[X86][AVX1] Split 256-bit vector non-temporal loads to keep them non-temporal (PR32744)
Differential Revision: https://reviews.llvm.org/D33728
llvm-svn: 304718
1 parent a25bf0b commit b2ef948

File tree

3 files changed

+220
-106
lines changed

3 files changed

+220
-106
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6391,6 +6391,7 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
63916391
/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
63926392
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
63936393
const SDLoc &DL, SelectionDAG &DAG,
6394+
const X86Subtarget &Subtarget,
63946395
bool isAfterLegalize) {
63956396
unsigned NumElems = Elts.size();
63966397

@@ -6495,6 +6496,12 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
64956496
if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
64966497
return SDValue();
64976498

6499+
// Don't create 256-bit non-temporal aligned loads without AVX2 as these
6500+
// will lower to regular temporal loads and use the cache.
6501+
if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
6502+
VT.is256BitVector() && !Subtarget.hasInt256())
6503+
return SDValue();
6504+
64986505
if (IsConsecutiveLoad)
64996506
return CreateLoad(VT, LDBase);
65006507

@@ -7701,7 +7708,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
77017708
// See if we can use a vector load to get all of the elements.
77027709
if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
77037710
SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
7704-
if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, false))
7711+
if (SDValue LD =
7712+
EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
77057713
return LD;
77067714
}
77077715

@@ -28784,7 +28792,8 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
2878428792
}
2878528793

2878628794
if (Elts.size() == VT.getVectorNumElements())
28787-
if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true))
28795+
if (SDValue LD =
28796+
EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true))
2878828797
return LD;
2878928798

2879028799
// For AVX2, we sometimes want to combine
@@ -32377,15 +32386,17 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
3237732386
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3237832387

3237932388
// For chips with slow 32-byte unaligned loads, break the 32-byte operation
32380-
// into two 16-byte operations.
32389+
// into two 16-byte operations. Also split non-temporal aligned loads on
32390+
// pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
3238132391
ISD::LoadExtType Ext = Ld->getExtensionType();
3238232392
bool Fast;
3238332393
unsigned AddressSpace = Ld->getAddressSpace();
3238432394
unsigned Alignment = Ld->getAlignment();
3238532395
if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
3238632396
Ext == ISD::NON_EXTLOAD &&
32387-
TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
32388-
AddressSpace, Alignment, &Fast) && !Fast) {
32397+
((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
32398+
(TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
32399+
AddressSpace, Alignment, &Fast) && !Fast))) {
3238932400
unsigned NumElems = RegVT.getVectorNumElements();
3239032401
if (NumElems < 2)
3239132402
return SDValue();
@@ -35093,7 +35104,8 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
3509335104
if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
3509435105
OpVT, AS, Alignment, &Fast) && Fast) {
3509535106
SDValue Ops[] = {SubVec2, SubVec};
35096-
if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
35107+
if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG,
35108+
Subtarget, false))
3509735109
return Ld;
3509835110
}
3509935111
}

llvm/test/CodeGen/X86/fast-isel-nontemporal.ll

Lines changed: 60 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -957,8 +957,16 @@ define <16 x float> @test_load_nt16xfloat(<16 x float>* nocapture %ptr) {
957957
;
958958
; AVX1-LABEL: test_load_nt16xfloat:
959959
; AVX1: # BB#0: # %entry
960-
; AVX1-NEXT: vmovaps (%rdi), %ymm0
961-
; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
960+
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
961+
; AVX1-NEXT: # implicit-def: %YMM1
962+
; AVX1-NEXT: vmovaps %xmm0, %xmm1
963+
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0
964+
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
965+
; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2
966+
; AVX1-NEXT: # implicit-def: %YMM1
967+
; AVX1-NEXT: vmovaps %xmm2, %xmm1
968+
; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2
969+
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
962970
; AVX1-NEXT: retq
963971
;
964972
; AVX2-LABEL: test_load_nt16xfloat:
@@ -1003,8 +1011,16 @@ define <8 x double> @test_load_nt8xdouble(<8 x double>* nocapture %ptr) {
10031011
;
10041012
; AVX1-LABEL: test_load_nt8xdouble:
10051013
; AVX1: # BB#0: # %entry
1006-
; AVX1-NEXT: vmovapd (%rdi), %ymm0
1007-
; AVX1-NEXT: vmovapd 32(%rdi), %ymm1
1014+
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
1015+
; AVX1-NEXT: # implicit-def: %YMM1
1016+
; AVX1-NEXT: vmovaps %xmm0, %xmm1
1017+
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0
1018+
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1019+
; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2
1020+
; AVX1-NEXT: # implicit-def: %YMM1
1021+
; AVX1-NEXT: vmovaps %xmm2, %xmm1
1022+
; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2
1023+
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
10081024
; AVX1-NEXT: retq
10091025
;
10101026
; AVX2-LABEL: test_load_nt8xdouble:
@@ -1049,8 +1065,16 @@ define <64 x i8> @test_load_nt64xi8(<64 x i8>* nocapture %ptr) {
10491065
;
10501066
; AVX1-LABEL: test_load_nt64xi8:
10511067
; AVX1: # BB#0: # %entry
1052-
; AVX1-NEXT: vmovaps (%rdi), %ymm0
1053-
; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
1068+
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
1069+
; AVX1-NEXT: # implicit-def: %YMM1
1070+
; AVX1-NEXT: vmovaps %xmm0, %xmm1
1071+
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0
1072+
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1073+
; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2
1074+
; AVX1-NEXT: # implicit-def: %YMM1
1075+
; AVX1-NEXT: vmovaps %xmm2, %xmm1
1076+
; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2
1077+
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
10541078
; AVX1-NEXT: retq
10551079
;
10561080
; AVX2-LABEL: test_load_nt64xi8:
@@ -1101,8 +1125,16 @@ define <32 x i16> @test_load_nt32xi16(<32 x i16>* nocapture %ptr) {
11011125
;
11021126
; AVX1-LABEL: test_load_nt32xi16:
11031127
; AVX1: # BB#0: # %entry
1104-
; AVX1-NEXT: vmovaps (%rdi), %ymm0
1105-
; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
1128+
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
1129+
; AVX1-NEXT: # implicit-def: %YMM1
1130+
; AVX1-NEXT: vmovaps %xmm0, %xmm1
1131+
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0
1132+
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1133+
; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2
1134+
; AVX1-NEXT: # implicit-def: %YMM1
1135+
; AVX1-NEXT: vmovaps %xmm2, %xmm1
1136+
; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2
1137+
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
11061138
; AVX1-NEXT: retq
11071139
;
11081140
; AVX2-LABEL: test_load_nt32xi16:
@@ -1153,8 +1185,16 @@ define <16 x i32> @test_load_nt16xi32(<16 x i32>* nocapture %ptr) {
11531185
;
11541186
; AVX1-LABEL: test_load_nt16xi32:
11551187
; AVX1: # BB#0: # %entry
1156-
; AVX1-NEXT: vmovaps (%rdi), %ymm0
1157-
; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
1188+
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
1189+
; AVX1-NEXT: # implicit-def: %YMM1
1190+
; AVX1-NEXT: vmovaps %xmm0, %xmm1
1191+
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0
1192+
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1193+
; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2
1194+
; AVX1-NEXT: # implicit-def: %YMM1
1195+
; AVX1-NEXT: vmovaps %xmm2, %xmm1
1196+
; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2
1197+
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
11581198
; AVX1-NEXT: retq
11591199
;
11601200
; AVX2-LABEL: test_load_nt16xi32:
@@ -1199,8 +1239,16 @@ define <8 x i64> @test_load_nt8xi64(<8 x i64>* nocapture %ptr) {
11991239
;
12001240
; AVX1-LABEL: test_load_nt8xi64:
12011241
; AVX1: # BB#0: # %entry
1202-
; AVX1-NEXT: vmovaps (%rdi), %ymm0
1203-
; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
1242+
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
1243+
; AVX1-NEXT: # implicit-def: %YMM1
1244+
; AVX1-NEXT: vmovaps %xmm0, %xmm1
1245+
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0
1246+
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1247+
; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2
1248+
; AVX1-NEXT: # implicit-def: %YMM1
1249+
; AVX1-NEXT: vmovaps %xmm2, %xmm1
1250+
; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2
1251+
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
12041252
; AVX1-NEXT: retq
12051253
;
12061254
; AVX2-LABEL: test_load_nt8xi64:

0 commit comments

Comments
 (0)