Skip to content

Commit bbf31a1

Browse files
committed
[X86] shouldReduceLoadWidth - don't split loads if ANY uses are an extract+store or a full width legal binop
Currently shouldReduceLoadWidth is very relaxed about when loads can be split to avoid extractions from the original full-width load - resulting in many cases where the number of memory operations notably increases, replacing the cost of an extract_subvector with additional loads. This patch adjusts the 256/512-bit vector load splitting metric to not split if ANY use of the full-width load can be used directly - either in an extract+store (previously ALL uses had to be extract+store to prevent splits) or in a legal binop (so it is unlikely to be split itself). This required a number of fixes - shouldReduceLoadWidth now needs to peek through bitcasts UP the use-chain to find final users (limited to hasOneUse cases to reduce complexity). It also exposed an issue in isTargetCanonicalConstantNode, which assumed that a load of vector constant data would always be extracted, which is no longer the case.
1 parent 14b38cf commit bbf31a1

12 files changed

+2480
-2453
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 44 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3263,6 +3263,12 @@ bool X86TargetLowering::shouldReduceLoadWidth(
32633263
std::optional<unsigned> ByteOffset) const {
32643264
assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
32653265

3266+
auto PeekThroughOneUserBitcasts = [](const SDNode *N) {
3267+
while (N->getOpcode() == ISD::BITCAST && N->hasOneUse())
3268+
N = *N->user_begin();
3269+
return N;
3270+
};
3271+
32663272
// "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
32673273
// relocation target a movq or addq instruction: don't let the load shrink.
32683274
SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
@@ -3272,24 +3278,46 @@ bool X86TargetLowering::shouldReduceLoadWidth(
32723278

32733279
// If this is an (1) AVX vector load with (2) multiple uses and (3) all of
32743280
// those uses are extracted directly into a store, then the extract + store
3275-
// can be store-folded. Therefore, it's probably not worth splitting the load.
3281+
// can be store-folded, or (4) any use will be used by legal full width
3282+
// instruction. Then, it's probably not worth splitting the load.
32763283
EVT VT = Load->getValueType(0);
32773284
if ((VT.is256BitVector() || VT.is512BitVector()) &&
32783285
!SDValue(Load, 0).hasOneUse()) {
3286+
bool FullWidthUse = false;
3287+
bool AllExtractStores = true;
32793288
for (SDUse &Use : Load->uses()) {
32803289
// Skip uses of the chain value. Result 0 of the node is the load value.
32813290
if (Use.getResNo() != 0)
32823291
continue;
32833292

3284-
SDNode *User = Use.getUser();
3293+
const SDNode *User = PeekThroughOneUserBitcasts(Use.getUser());
32853294

3286-
// If this use is not an extract + store, it's probably worth splitting.
3287-
if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR || !User->hasOneUse() ||
3288-
User->user_begin()->getOpcode() != ISD::STORE)
3289-
return true;
3295+
// If this use is an extract + store, it's probably not worth splitting.
3296+
if (User->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
3297+
all_of(User->uses(), [&](const SDUse &U) {
3298+
const SDNode *Inner = PeekThroughOneUserBitcasts(U.getUser());
3299+
return Inner->getOpcode() == ISD::STORE;
3300+
}))
3301+
continue;
3302+
3303+
AllExtractStores = false;
3304+
3305+
// If any use is a full width legal/target bin op, then assume its legal
3306+
// and won't split.
3307+
if (isBinOp(User->getOpcode()) &&
3308+
(isOperationLegal(User->getOpcode(), User->getValueType(0)) ||
3309+
User->getOpcode() > ISD::BUILTIN_OP_END))
3310+
FullWidthUse = true;
32903311
}
3291-
// All non-chain uses are extract + store.
3292-
return false;
3312+
3313+
if (AllExtractStores)
3314+
return false;
3315+
3316+
// If we have an user that uses the full vector width, then this use is
3317+
// only worth splitting if the offset isn't 0 (to avoid an
3318+
// EXTRACT_SUBVECTOR) or we're loading a scalar integer.
3319+
if (FullWidthUse)
3320+
return (ByteOffset.value_or(0) > 0) || NewVT.isScalarInteger();
32933321
}
32943322

32953323
return true;
@@ -59124,6 +59152,14 @@ static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
5912459152
return Res;
5912559153
}
5912659154

59155+
// Match insertion of subvector load that perfectly aliases a base load.
59156+
if ((IdxVal % SubVecNumElts) == 0 && ISD::isNormalLoad(Vec.getNode()) &&
59157+
ISD::isNormalLoad(SubVec.getNode()) &&
59158+
DAG.areNonVolatileConsecutiveLoads(
59159+
cast<LoadSDNode>(SubVec), cast<LoadSDNode>(Vec),
59160+
SubVec.getValueSizeInBits() / 8, IdxVal / SubVecNumElts))
59161+
return Vec;
59162+
5912759163
return SDValue();
5912859164
}
5912959165

llvm/lib/Target/X86/X86ISelLowering.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1337,15 +1337,18 @@ namespace llvm {
13371337
unsigned Depth) const override;
13381338

13391339
bool isTargetCanonicalConstantNode(SDValue Op) const override {
1340-
// Peek through bitcasts/extracts/inserts to see if we have a broadcast
1341-
// vector from memory.
1340+
// Peek through bitcasts/extracts/inserts to see if we have a vector
1341+
// load/broadcast from memory.
13421342
while (Op.getOpcode() == ISD::BITCAST ||
13431343
Op.getOpcode() == ISD::EXTRACT_SUBVECTOR ||
13441344
(Op.getOpcode() == ISD::INSERT_SUBVECTOR &&
13451345
Op.getOperand(0).isUndef()))
13461346
Op = Op.getOperand(Op.getOpcode() == ISD::INSERT_SUBVECTOR ? 1 : 0);
13471347

13481348
return Op.getOpcode() == X86ISD::VBROADCAST_LOAD ||
1349+
Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
1350+
(Op.getOpcode() == ISD::LOAD &&
1351+
getTargetConstantFromLoad(cast<LoadSDNode>(Op))) ||
13491352
TargetLowering::isTargetCanonicalConstantNode(Op);
13501353
}
13511354

llvm/test/CodeGen/X86/oddsubvector.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -261,9 +261,9 @@ define void @PR42833() {
261261
;
262262
; AVX2-LABEL: PR42833:
263263
; AVX2: # %bb.0:
264-
; AVX2-NEXT: movl b(%rip), %eax
265264
; AVX2-NEXT: vmovdqu c+128(%rip), %ymm0
266-
; AVX2-NEXT: addl c+128(%rip), %eax
265+
; AVX2-NEXT: vmovd %xmm0, %eax
266+
; AVX2-NEXT: addl b(%rip), %eax
267267
; AVX2-NEXT: vmovd %eax, %xmm1
268268
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2
269269
; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm3
@@ -284,10 +284,10 @@ define void @PR42833() {
284284
;
285285
; AVX512-LABEL: PR42833:
286286
; AVX512: # %bb.0:
287-
; AVX512-NEXT: movl b(%rip), %eax
288287
; AVX512-NEXT: vmovdqu c+128(%rip), %ymm0
289288
; AVX512-NEXT: vmovdqu64 c+128(%rip), %zmm1
290-
; AVX512-NEXT: addl c+128(%rip), %eax
289+
; AVX512-NEXT: vmovd %xmm0, %eax
290+
; AVX512-NEXT: addl b(%rip), %eax
291291
; AVX512-NEXT: vmovd %eax, %xmm2
292292
; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm2
293293
; AVX512-NEXT: vpaddd %ymm0, %ymm0, %ymm0

llvm/test/CodeGen/X86/setcc-lowering.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ define <8 x i16> @pr25080(<8 x i32> %a) nounwind {
1111
; AVX1-LABEL: pr25080:
1212
; AVX1: # %bb.0: # %entry
1313
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
14-
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
14+
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
1515
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
1616
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
1717
; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0

0 commit comments

Comments
 (0)