
Commit 8489c8e

[X86] shouldReduceLoadWidth - don't split loads if ANY uses are an extract+store or a full width legal binop
Currently shouldReduceLoadWidth is very relaxed about when loads can be split to avoid extractions from the original full width load, resulting in many cases where the number of memory operations notably increases, trading the cost of an extract_subvector for additional loads.

This patch adjusts the 256/512-bit vector load splitting metric so that the load is not split if ANY use of the full width load can use it directly: either in an extract+store (previously ALL uses had to be extract+store to prevent splits) or as an operand of a legal full width binop (which is therefore unlikely to be split itself).

This required a number of fixes. shouldReduceLoadWidth now needs to peek through bitcasts UP the use-chain to find the final users (limited to hasOneUse cases to reduce complexity). It also exposed an issue in isTargetCanonicalConstantNode, which assumed that a load of vector constant data would always be extracted from, which is no longer the case.
1 parent 22d8ba3 · commit 8489c8e

16 files changed: +1870 -1846 lines
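To illustrate the adjusted heuristic, the following is a minimal, self-contained C++ sketch of the decision over a toy use-graph. The Opcode/Node types and the isBinOp/isFullWidthLegal helpers are stand-ins invented for illustration only; they are not LLVM's SelectionDAG API, and the real code in the diff below operates on SDNode/SDUse instead.

    #include <algorithm>
    #include <vector>

    // Toy stand-ins for SelectionDAG opcodes and nodes (hypothetical).
    enum class Opcode { Load, Bitcast, ExtractSubvector, Store, Add };

    struct Node {
      Opcode Op;
      std::vector<Node *> Users; // nodes that consume this node's result
    };

    // Walk UP the use-chain through single-use bitcasts to find the real user.
    static const Node *peekThroughOneUseBitcasts(const Node *N) {
      while (N->Op == Opcode::Bitcast && N->Users.size() == 1)
        N = N->Users.front();
      return N;
    }

    // Stand-ins for the target's isBinOp / isOperationLegal queries.
    static bool isBinOp(Opcode Op) { return Op == Opcode::Add; }
    static bool isFullWidthLegal(Opcode Op) { return Op == Opcode::Add; }

    // Returns true if splitting a multi-use 256/512-bit load still looks
    // profitable; false if any use can consume the full-width load directly.
    static bool shouldReduceLoadWidth(const Node &Load) {
      for (const Node *RawUser : Load.Users) {
        const Node *User = peekThroughOneUseBitcasts(RawUser);

        // An extract whose consumers are all stores can be store-folded,
        // so keep the full-width load.
        if (User->Op == Opcode::ExtractSubvector &&
            std::all_of(User->Users.begin(), User->Users.end(),
                        [](const Node *Inner) {
                          return peekThroughOneUseBitcasts(Inner)->Op ==
                                 Opcode::Store;
                        }))
          return false;

        // A legal full-width binop will not be split either, so keep the load.
        if (isBinOp(User->Op) && isFullWidthLegal(User->Op))
          return false;
      }
      return true; // no use keeps the full width live; splitting is profitable
    }

    int main() {
      Node Store{Opcode::Store, {}};
      Node Extract{Opcode::ExtractSubvector, {&Store}};
      Node Add{Opcode::Add, {}};
      Node Load{Opcode::Load, {&Extract, &Add}};
      return shouldReduceLoadWidth(Load) ? 1 : 0; // exits 0: load kept whole
    }

In this example the load has one use that is an extract feeding a store and one use that is a full-width add, so the sketch keeps the load whole; under the previous ALL-uses rule the single binop use would already have forced a split.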

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 24 additions & 9 deletions
@@ -3260,16 +3260,23 @@ bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
                                               EVT NewVT) const {
   assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
 
+  auto PeekThroughOneUserBitcasts = [](const SDNode *N) {
+    while (N->getOpcode() == ISD::BITCAST && N->hasOneUse())
+      N = *N->user_begin();
+    return N;
+  };
+
   // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
   // relocation target a movq or addq instruction: don't let the load shrink.
   SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
   if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
     if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
       return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
 
-  // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
+  // If this is an (1) AVX vector load with (2) multiple uses and (3) any of
   // those uses are extracted directly into a store, then the extract + store
-  // can be store-folded. Therefore, it's probably not worth splitting the load.
+  // can be store-folded, or (4) any use will be used by legal full width
+  // instruction. Then, it's probably not worth splitting the load.
   EVT VT = Load->getValueType(0);
   if ((VT.is256BitVector() || VT.is512BitVector()) &&
       !SDValue(Load, 0).hasOneUse()) {
@@ -3278,15 +3285,23 @@ bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
       if (Use.getResNo() != 0)
         continue;
 
-      SDNode *User = Use.getUser();
+      const SDNode *User = PeekThroughOneUserBitcasts(Use.getUser());
 
-      // If this use is not an extract + store, it's probably worth splitting.
-      if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR || !User->hasOneUse() ||
-          User->user_begin()->getOpcode() != ISD::STORE)
-        return true;
+      // If any use is an extract + store, it's probably not worth splitting.
+      if (User->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+          all_of(User->uses(), [&](const SDUse &U) {
+            const SDNode *Inner = PeekThroughOneUserBitcasts(U.getUser());
+            return Inner->getOpcode() == ISD::STORE;
+          }))
+        return false;
+
+      // If any use is a full width legal/target bin op, then assume its legal
+      // and shouldn't split.
+      if (isBinOp(User->getOpcode()) &&
+          (isOperationLegal(User->getOpcode(), VT) ||
+           User->getOpcode() > ISD::BUILTIN_OP_END))
+        return false;
     }
-    // All non-chain uses are extract + store.
-    return false;
   }
 
   return true;

llvm/lib/Target/X86/X86ISelLowering.h

Lines changed: 5 additions & 2 deletions
@@ -1337,15 +1337,18 @@ namespace llvm {
                                        unsigned Depth) const override;
 
     bool isTargetCanonicalConstantNode(SDValue Op) const override {
-      // Peek through bitcasts/extracts/inserts to see if we have a broadcast
-      // vector from memory.
+      // Peek through bitcasts/extracts/inserts to see if we have a vector
+      // load/broadcast from memory.
       while (Op.getOpcode() == ISD::BITCAST ||
              Op.getOpcode() == ISD::EXTRACT_SUBVECTOR ||
              (Op.getOpcode() == ISD::INSERT_SUBVECTOR &&
               Op.getOperand(0).isUndef()))
         Op = Op.getOperand(Op.getOpcode() == ISD::INSERT_SUBVECTOR ? 1 : 0);
 
       return Op.getOpcode() == X86ISD::VBROADCAST_LOAD ||
+             Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
+             (Op.getOpcode() == ISD::LOAD &&
+              getTargetConstantFromLoad(cast<LoadSDNode>(Op))) ||
              TargetLowering::isTargetCanonicalConstantNode(Op);
     }

llvm/test/CodeGen/X86/oddsubvector.ll

Lines changed: 4 additions & 4 deletions
@@ -261,9 +261,9 @@ define void @PR42833() {
 ;
 ; AVX2-LABEL: PR42833:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    movl b(%rip), %eax
 ; AVX2-NEXT:    vmovdqu c+128(%rip), %ymm0
-; AVX2-NEXT:    addl c+128(%rip), %eax
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    addl b(%rip), %eax
 ; AVX2-NEXT:    vmovd %eax, %xmm1
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm2
 ; AVX2-NEXT:    vpaddd %ymm0, %ymm0, %ymm3
@@ -284,10 +284,10 @@ define void @PR42833() {
 ;
 ; AVX512-LABEL: PR42833:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    movl b(%rip), %eax
 ; AVX512-NEXT:    vmovdqu c+128(%rip), %ymm0
 ; AVX512-NEXT:    vmovdqu64 c+128(%rip), %zmm1
-; AVX512-NEXT:    addl c+128(%rip), %eax
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    addl b(%rip), %eax
 ; AVX512-NEXT:    vmovd %eax, %xmm2
 ; AVX512-NEXT:    vpaddd %ymm2, %ymm0, %ymm2
 ; AVX512-NEXT:    vpaddd %ymm0, %ymm0, %ymm0

llvm/test/CodeGen/X86/setcc-lowering.ll

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ define <8 x i16> @pr25080(<8 x i32> %a) nounwind {
 ; AVX1-LABEL: pr25080:
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0

llvm/test/CodeGen/X86/vec_int_to_fp.ll

Lines changed: 10 additions & 7 deletions
@@ -4228,7 +4228,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(ptr%a) {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm1
-; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpsrlq $1, %xmm2, %xmm3
 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
 ; AVX1-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
@@ -4273,7 +4273,8 @@ define <4 x float> @uitofp_load_4i64_to_4f32(ptr%a) {
 ; AVX2-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm1
 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
 ; AVX2-NEXT:    vaddps %xmm1, %xmm1, %xmm2
-; AVX2-NEXT:    vpackssdw 16(%rdi), %xmm0, %xmm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT:    vpackssdw %xmm3, %xmm0, %xmm0
 ; AVX2-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -4658,7 +4659,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) {
 ; AVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [1,1,1,1]
 ; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm3
 ; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm4
-; AVX1-NEXT:    vmovdqa 48(%rdi), %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
 ; AVX1-NEXT:    vpsrlq $1, %xmm5, %xmm6
 ; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm4, %ymm4
 ; AVX1-NEXT:    vorps %ymm3, %ymm4, %ymm3
@@ -4680,7 +4681,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) {
 ; AVX1-NEXT:    vblendvps %xmm1, %xmm4, %xmm3, %xmm1
 ; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm2
 ; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm3
-; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
 ; AVX1-NEXT:    vpsrlq $1, %xmm4, %xmm5
 ; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm3, %ymm3
 ; AVX1-NEXT:    vorps %ymm2, %ymm3, %ymm2
@@ -4725,7 +4726,8 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) {
 ; AVX2-NEXT:    vcvtsi2ss %rax, %xmm6, %xmm3
 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[0]
 ; AVX2-NEXT:    vaddps %xmm3, %xmm3, %xmm4
-; AVX2-NEXT:    vpackssdw 48(%rdi), %xmm1, %xmm1
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5
+; AVX2-NEXT:    vpackssdw %xmm5, %xmm1, %xmm1
 ; AVX2-NEXT:    vblendvps %xmm1, %xmm4, %xmm3, %xmm1
 ; AVX2-NEXT:    vandps %ymm2, %ymm0, %ymm2
 ; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm3
@@ -4744,7 +4746,8 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) {
 ; AVX2-NEXT:    vcvtsi2ss %rax, %xmm6, %xmm2
 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[0]
 ; AVX2-NEXT:    vaddps %xmm2, %xmm2, %xmm3
-; AVX2-NEXT:    vpackssdw 16(%rdi), %xmm0, %xmm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
+; AVX2-NEXT:    vpackssdw %xmm4, %xmm0, %xmm0
 ; AVX2-NEXT:    vblendvps %xmm0, %xmm3, %xmm2, %xmm0
 ; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
@@ -4849,7 +4852,7 @@ define <8 x float> @uitofp_load_8i32_to_8f32(ptr%a) {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT:    vcvtdq2ps %ymm1, %ymm1
