
Commit 86dfbc6

[SLP] add another bailout for load-combine patterns
This builds on the or-reduction bailout that was added with D67841. We still do not have IR-level load combining, although that could be a target-specific enhancement for -vector-combiner. The heuristic is narrowly defined to catch the motivating case from PR39538: https://bugs.llvm.org/show_bug.cgi?id=39538 ...while preserving existing functionality. That is, there's an unmodified test of pure load/zext/store that is not seen in this patch at llvm/test/Transforms/SLPVectorizer/X86/cast.ll. That's the reason for the logic difference to require the 'or' instructions. The chances that vectorization would actually help a memory-bound sequence like that seem small, but it looks nicer with: vpmovzxwd (%rsi), %xmm0 vmovdqu %xmm0, (%rdi) rather than: movzwl (%rsi), %eax movl %eax, (%rdi) ... In the motivating test, we avoid creating a vector mess that is unrecoverable in the backend, and SDAG forms the expected bswap instructions after load combining: movzbl (%rdi), %eax vmovd %eax, %xmm0 movzbl 1(%rdi), %eax vmovd %eax, %xmm1 movzbl 2(%rdi), %eax vpinsrb $4, 4(%rdi), %xmm0, %xmm0 vpinsrb $8, 8(%rdi), %xmm0, %xmm0 vpinsrb $12, 12(%rdi), %xmm0, %xmm0 vmovd %eax, %xmm2 movzbl 3(%rdi), %eax vpinsrb $1, 5(%rdi), %xmm1, %xmm1 vpinsrb $2, 9(%rdi), %xmm1, %xmm1 vpinsrb $3, 13(%rdi), %xmm1, %xmm1 vpslld $24, %xmm0, %xmm0 vpmovzxbd %xmm1, %xmm1 # xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero vpslld $16, %xmm1, %xmm1 vpor %xmm0, %xmm1, %xmm0 vpinsrb $1, 6(%rdi), %xmm2, %xmm1 vmovd %eax, %xmm2 vpinsrb $2, 10(%rdi), %xmm1, %xmm1 vpinsrb $3, 14(%rdi), %xmm1, %xmm1 vpinsrb $1, 7(%rdi), %xmm2, %xmm2 vpinsrb $2, 11(%rdi), %xmm2, %xmm2 vpmovzxbd %xmm1, %xmm1 # xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero vpinsrb $3, 15(%rdi), %xmm2, %xmm2 vpslld $8, %xmm1, %xmm1 vpmovzxbd %xmm2, %xmm2 # xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero vpor %xmm2, %xmm1, %xmm1 vpor %xmm1, %xmm0, %xmm0 vmovdqu %xmm0, (%rsi) movl (%rdi), %eax movl 4(%rdi), %ecx movl 8(%rdi), %edx movbel %eax, (%rsi) movbel %ecx, 4(%rsi) movl 12(%rdi), %ecx movbel %edx, 8(%rsi) movbel %ecx, 12(%rsi) Differential Revision: https://reviews.llvm.org/D78997
1 parent 8d0c3c0 commit 86dfbc6

2 files changed (+75, -33 lines)

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (31 additions, 5 deletions)
@@ -666,6 +666,15 @@ class BoUpSLP {
   /// may not be necessary.
   bool isLoadCombineReductionCandidate(unsigned ReductionOpcode) const;

+  /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
+  /// can be load combined in the backend. Load combining may not be allowed in
+  /// the IR optimizer, so we do not want to alter the pattern. For example,
+  /// partially transforming a scalar bswap() pattern into vector code is
+  /// effectively impossible for the backend to undo.
+  /// TODO: If load combining is allowed in the IR optimizer, this analysis
+  /// may not be necessary.
+  bool isLoadCombineCandidate() const;
+
   OptimizationRemarkEmitter *getORE() { return ORE; }

   /// This structure holds any data we need about the edges being traversed
@@ -3673,8 +3682,8 @@ bool BoUpSLP::isFullyVectorizableTinyTree() const {
   return true;
 }

-static bool isLoadCombineCandidate(Value *Root, unsigned NumElts,
-                                   TargetTransformInfo *TTI) {
+static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
+                                       TargetTransformInfo *TTI) {
   // Look past the root to find a source value. Arbitrarily follow the
   // path through operand 0 of any 'or'. Also, peek through optional
   // shift-left-by-constant.
@@ -3683,9 +3692,9 @@ static bool isLoadCombineCandidate(Value *Root, unsigned NumElts,
          match(ZextLoad, m_Shl(m_Value(), m_Constant())))
     ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0);

-  // Check if the input is an extended load.
+  // Check if the input is an extended load of the required or/shift expression.
   Value *LoadPtr;
-  if (!match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr)))))
+  if (ZextLoad == Root || !match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr)))))
     return false;

   // Require that the total load bit width is a legal integer type.
@@ -3710,7 +3719,20 @@ bool BoUpSLP::isLoadCombineReductionCandidate(unsigned RdxOpcode) const {

   unsigned NumElts = VectorizableTree[0]->Scalars.size();
   Value *FirstReduced = VectorizableTree[0]->Scalars[0];
-  return isLoadCombineCandidate(FirstReduced, NumElts, TTI);
+  return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI);
+}
+
+bool BoUpSLP::isLoadCombineCandidate() const {
+  // Peek through a final sequence of stores and check if all operations are
+  // likely to be load-combined.
+  unsigned NumElts = VectorizableTree[0]->Scalars.size();
+  for (Value *Scalar : VectorizableTree[0]->Scalars) {
+    Value *X;
+    if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
+        !isLoadCombineCandidateImpl(X, NumElts, TTI))
+      return false;
+  }
+  return true;
 }

 bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const {
@@ -5758,6 +5780,8 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
   }
   if (R.isTreeTinyAndNotFullyVectorizable())
     return false;
+  if (R.isLoadCombineCandidate())
+    return false;

   R.computeMinimumValueSizes();

@@ -6010,6 +6034,8 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
     }
     if (R.isTreeTinyAndNotFullyVectorizable())
       continue;
+    if (R.isLoadCombineCandidate())
+      return false;

     R.computeMinimumValueSizes();
     int Cost = R.getTreeCost() - UserCost;
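For readers who want the new per-scalar check in one place, the following is a simplified standalone sketch (an illustration, not code from this patch): it assumes LLVM's PatternMatch helpers and folds the store match from isLoadCombineCandidate() together with the walk done by isLoadCombineCandidateImpl(), while omitting the TTI legal-integer-width check that the real implementation also performs.

// Simplified sketch only (see caveats above); mirrors the shape of the check.
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PatternMatch.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// Return true if 'Scalar' is a store whose value operand is a zext(load)
// reached through at least one 'or' (optionally behind shl-by-constant),
// i.e. the kind of chain the backend is expected to load-combine.
static bool looksLikeLoadCombineStore(Value *Scalar) {
  Value *StoredVal;
  if (!match(Scalar, m_Store(m_Value(StoredVal), m_Value())))
    return false;

  // Walk through operand 0 of any 'or' and peek through optional
  // shift-left-by-constant, as isLoadCombineCandidateImpl() does.
  Value *ZextLoad = StoredVal;
  while (!isa<ConstantExpr>(ZextLoad) &&
         (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
          match(ZextLoad, m_Shl(m_Value(), m_Constant()))))
    ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0);

  // Require that we stepped through at least one or/shl and ended at an
  // extended load. (The real code additionally checks the combined load
  // width against TTI for legality.)
  Value *LoadPtr;
  return ZextLoad != StoredVal &&
         match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr))));
}

The ZextLoad != StoredVal requirement is the "logic difference" called out in the commit message: a plain load/zext/store chain (as in X86/cast.ll) does not trip the bailout and can still be vectorized.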

llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll (44 additions, 28 deletions)
@@ -393,34 +393,50 @@ define void @PR39538(i8* %t0, i32* %t1) {
 ; CHECK-NEXT: [[T63:%.*]] = load i8, i8* [[T62]], align 1
 ; CHECK-NEXT: [[T68:%.*]] = load i8, i8* [[T67]], align 1
 ; CHECK-NEXT: [[T73:%.*]] = load i8, i8* [[T72]], align 1
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i8> undef, i8 [[T3]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i8> [[TMP1]], i8 [[T21]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[T40]], i32 2
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i8> [[TMP3]], i8 [[T59]], i32 3
-; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i8> undef, i8 [[T7]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i8> [[TMP6]], i8 [[T25]], i32 1
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i8> [[TMP7]], i8 [[T44]], i32 2
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i8> [[TMP8]], i8 [[T63]], i32 3
-; CHECK-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[TMP9]] to <4 x i32>
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i8> undef, i8 [[T12]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i8> [[TMP11]], i8 [[T30]], i32 1
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i8> [[TMP12]], i8 [[T49]], i32 2
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i8> [[TMP13]], i8 [[T68]], i32 3
-; CHECK-NEXT: [[TMP15:%.*]] = zext <4 x i8> [[TMP14]] to <4 x i32>
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i8> undef, i8 [[T17]], i32 0
-; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i8> [[TMP16]], i8 [[T35]], i32 1
-; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i8> [[TMP17]], i8 [[T54]], i32 2
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i8> [[TMP18]], i8 [[T73]], i32 3
-; CHECK-NEXT: [[TMP20:%.*]] = zext <4 x i8> [[TMP19]] to <4 x i32>
-; CHECK-NEXT: [[TMP21:%.*]] = shl nuw <4 x i32> [[TMP5]], <i32 24, i32 24, i32 24, i32 24>
-; CHECK-NEXT: [[TMP22:%.*]] = shl nuw nsw <4 x i32> [[TMP10]], <i32 16, i32 16, i32 16, i32 16>
-; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw <4 x i32> [[TMP15]], <i32 8, i32 8, i32 8, i32 8>
-; CHECK-NEXT: [[TMP24:%.*]] = or <4 x i32> [[TMP22]], [[TMP21]]
-; CHECK-NEXT: [[TMP25:%.*]] = or <4 x i32> [[TMP24]], [[TMP23]]
-; CHECK-NEXT: [[TMP26:%.*]] = or <4 x i32> [[TMP25]], [[TMP20]]
-; CHECK-NEXT: [[TMP27:%.*]] = bitcast i32* [[T1]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP26]], <4 x i32>* [[TMP27]], align 4
+; CHECK-NEXT: [[T4:%.*]] = zext i8 [[T3]] to i32
+; CHECK-NEXT: [[T8:%.*]] = zext i8 [[T7]] to i32
+; CHECK-NEXT: [[T13:%.*]] = zext i8 [[T12]] to i32
+; CHECK-NEXT: [[T18:%.*]] = zext i8 [[T17]] to i32
+; CHECK-NEXT: [[T22:%.*]] = zext i8 [[T21]] to i32
+; CHECK-NEXT: [[T26:%.*]] = zext i8 [[T25]] to i32
+; CHECK-NEXT: [[T31:%.*]] = zext i8 [[T30]] to i32
+; CHECK-NEXT: [[T36:%.*]] = zext i8 [[T35]] to i32
+; CHECK-NEXT: [[T41:%.*]] = zext i8 [[T40]] to i32
+; CHECK-NEXT: [[T45:%.*]] = zext i8 [[T44]] to i32
+; CHECK-NEXT: [[T50:%.*]] = zext i8 [[T49]] to i32
+; CHECK-NEXT: [[T55:%.*]] = zext i8 [[T54]] to i32
+; CHECK-NEXT: [[T60:%.*]] = zext i8 [[T59]] to i32
+; CHECK-NEXT: [[T64:%.*]] = zext i8 [[T63]] to i32
+; CHECK-NEXT: [[T69:%.*]] = zext i8 [[T68]] to i32
+; CHECK-NEXT: [[T74:%.*]] = zext i8 [[T73]] to i32
+; CHECK-NEXT: [[T5:%.*]] = shl nuw i32 [[T4]], 24
+; CHECK-NEXT: [[T23:%.*]] = shl nuw i32 [[T22]], 24
+; CHECK-NEXT: [[T42:%.*]] = shl nuw i32 [[T41]], 24
+; CHECK-NEXT: [[T61:%.*]] = shl nuw i32 [[T60]], 24
+; CHECK-NEXT: [[T9:%.*]] = shl nuw nsw i32 [[T8]], 16
+; CHECK-NEXT: [[T27:%.*]] = shl nuw nsw i32 [[T26]], 16
+; CHECK-NEXT: [[T46:%.*]] = shl nuw nsw i32 [[T45]], 16
+; CHECK-NEXT: [[T65:%.*]] = shl nuw nsw i32 [[T64]], 16
+; CHECK-NEXT: [[T14:%.*]] = shl nuw nsw i32 [[T13]], 8
+; CHECK-NEXT: [[T32:%.*]] = shl nuw nsw i32 [[T31]], 8
+; CHECK-NEXT: [[T51:%.*]] = shl nuw nsw i32 [[T50]], 8
+; CHECK-NEXT: [[T70:%.*]] = shl nuw nsw i32 [[T69]], 8
+; CHECK-NEXT: [[T10:%.*]] = or i32 [[T9]], [[T5]]
+; CHECK-NEXT: [[T15:%.*]] = or i32 [[T10]], [[T14]]
+; CHECK-NEXT: [[T19:%.*]] = or i32 [[T15]], [[T18]]
+; CHECK-NEXT: [[T28:%.*]] = or i32 [[T27]], [[T23]]
+; CHECK-NEXT: [[T33:%.*]] = or i32 [[T28]], [[T32]]
+; CHECK-NEXT: [[T37:%.*]] = or i32 [[T33]], [[T36]]
+; CHECK-NEXT: [[T47:%.*]] = or i32 [[T46]], [[T42]]
+; CHECK-NEXT: [[T52:%.*]] = or i32 [[T47]], [[T51]]
+; CHECK-NEXT: [[T56:%.*]] = or i32 [[T52]], [[T55]]
+; CHECK-NEXT: [[T66:%.*]] = or i32 [[T65]], [[T61]]
+; CHECK-NEXT: [[T71:%.*]] = or i32 [[T66]], [[T70]]
+; CHECK-NEXT: [[T75:%.*]] = or i32 [[T71]], [[T74]]
+; CHECK-NEXT: store i32 [[T19]], i32* [[T1]], align 4
+; CHECK-NEXT: store i32 [[T37]], i32* [[T38]], align 4
+; CHECK-NEXT: store i32 [[T56]], i32* [[T57]], align 4
+; CHECK-NEXT: store i32 [[T75]], i32* [[T76]], align 4
 ; CHECK-NEXT: ret void
 ;
   %t6 = getelementptr inbounds i8, i8* %t0, i64 1
