Skip to content

Commit 58016f0

Browse files
alexey-bataev authored and yuxuanchen1997 committed
[SLP] Do not vectorize small (<=2) buildvector/buildvalue sequences with MaxVFOnly==true.
Summary: If MaxVFOnly for buildvector/buildvalue vectorization is set to true and the total number of elements to vectorize is <= 2, it is better to try to vectorize reductions first, which may produce a larger tree (reductions require at least 4 elements to vectorize). The smaller buildvector/buildvalue sequence will be attempted later, with MaxVFOnly set to false. Test Plan: Reviewers: Reviewed By: Subscribers: Tasks: Tags: Differential Revision: https://phabricator.intern.facebook.com/D60251539
1 parent fd7bb37 commit 58016f0

File tree

3 files changed

+35
-21
lines changed

3 files changed

+35
-21
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -18285,6 +18285,14 @@ bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
1828518285
if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts))
1828618286
return false;
1828718287

18288+
if (MaxVFOnly && BuildVectorOpds.size() == 2) {
18289+
R.getORE()->emit([&]() {
18290+
return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
18291+
<< "Cannot SLP vectorize list: only 2 elements of buildvalue, "
18292+
"trying reduction first.";
18293+
});
18294+
return false;
18295+
}
1828818296
LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
1828918297
// Aggregate value is unlikely to be processed in vector register.
1829018298
return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
@@ -18301,6 +18309,14 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
1830118309
isFixedVectorShuffle(BuildVectorOpds, Mask)))
1830218310
return false;
1830318311

18312+
if (MaxVFOnly && BuildVectorInsts.size() == 2) {
18313+
R.getORE()->emit([&]() {
18314+
return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
18315+
<< "Cannot SLP vectorize list: only 2 elements of buildvector, "
18316+
"trying reduction first.";
18317+
});
18318+
return false;
18319+
}
1830418320
LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
1830518321
return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
1830618322
}

llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll

Lines changed: 18 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -10,33 +10,32 @@ define fastcc i64 @zot(float %arg, float %arg1, float %arg2, float %arg3, float
1010
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[ARG3:%.*]], i32 2
1111
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
1212
; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> <float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[TMP2]]
13-
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[ARG3]], i32 0
14-
; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <2 x float> [[TMP4]], <float 1.000000e+00, float 0.000000e+00>
15-
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
16-
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP6]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
17-
; CHECK-NEXT: [[TMP8:%.*]] = fadd fast <4 x float> [[TMP7]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
13+
; CHECK-NEXT: [[VAL12:%.*]] = fadd fast float [[ARG3]], 1.000000e+00
14+
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP2]], float [[VAL12]], i32 0
15+
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float 0.000000e+00, i32 1
16+
; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <4 x float> [[TMP5]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
1817
; CHECK-NEXT: br i1 [[ARG6:%.*]], label [[BB18:%.*]], label [[BB57:%.*]]
1918
; CHECK: bb18:
20-
; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x float> [ [[TMP8]], [[BB:%.*]] ]
21-
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP8]], i32 2
22-
; CHECK-NEXT: [[VAL23:%.*]] = fmul fast float [[TMP10]], 2.000000e+00
23-
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP8]], i32 3
24-
; CHECK-NEXT: [[VAL24:%.*]] = fmul fast float [[TMP11]], 3.000000e+00
19+
; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x float> [ [[TMP6]], [[BB:%.*]] ]
20+
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP6]], i32 2
21+
; CHECK-NEXT: [[VAL23:%.*]] = fmul fast float [[TMP8]], 2.000000e+00
22+
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP6]], i32 3
23+
; CHECK-NEXT: [[VAL24:%.*]] = fmul fast float [[TMP9]], 3.000000e+00
2524
; CHECK-NEXT: br i1 [[ARG7:%.*]], label [[BB25:%.*]], label [[BB57]]
2625
; CHECK: bb25:
27-
; CHECK-NEXT: [[TMP12:%.*]] = phi <4 x float> [ [[TMP9]], [[BB18]] ]
28-
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
26+
; CHECK-NEXT: [[TMP10:%.*]] = phi <4 x float> [ [[TMP7]], [[BB18]] ]
27+
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
2928
; CHECK-NEXT: br label [[BB30:%.*]]
3029
; CHECK: bb30:
3130
; CHECK-NEXT: [[VAL31:%.*]] = phi float [ [[VAL55:%.*]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ]
32-
; CHECK-NEXT: [[VAL32:%.*]] = phi float [ [[TMP13]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ]
33-
; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i8>, ptr [[ARG5:%.*]], align 1
34-
; CHECK-NEXT: [[TMP15:%.*]] = uitofp <4 x i8> [[TMP14]] to <4 x float>
35-
; CHECK-NEXT: [[TMP16:%.*]] = fsub fast <4 x float> [[TMP15]], [[TMP3]]
36-
; CHECK-NEXT: [[TMP17:%.*]] = fmul fast <4 x float> [[TMP16]], [[TMP12]]
37-
; CHECK-NEXT: [[TMP18:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP17]])
31+
; CHECK-NEXT: [[VAL32:%.*]] = phi float [ [[TMP11]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ]
32+
; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ARG5:%.*]], align 1
33+
; CHECK-NEXT: [[TMP13:%.*]] = uitofp <4 x i8> [[TMP12]] to <4 x float>
34+
; CHECK-NEXT: [[TMP14:%.*]] = fsub fast <4 x float> [[TMP13]], [[TMP3]]
35+
; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <4 x float> [[TMP14]], [[TMP10]]
36+
; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP15]])
3837
; CHECK-NEXT: [[VAL55]] = tail call fast float @llvm.minnum.f32(float [[VAL31]], float [[ARG1:%.*]])
39-
; CHECK-NEXT: [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[TMP18]])
38+
; CHECK-NEXT: [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[TMP16]])
4039
; CHECK-NEXT: call void @ham(float [[VAL55]], float [[VAL56]])
4140
; CHECK-NEXT: br i1 [[ARG8:%.*]], label [[BB30]], label [[BB57]]
4241
; CHECK: bb57:

llvm/test/Transforms/SLPVectorizer/X86/reused-extractelements.ll

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -7,8 +7,7 @@
77
; YAML-NEXT: Name: NotPossible
88
; YAML-NEXT: Function: g
99
; YAML-NEXT: Args:
10-
; YAML-NEXT: - String: 'Cannot SLP vectorize list: vectorization was impossible'
11-
; YAML-NEXT: - String: ' with available vectorization factors'
10+
; YAML-NEXT: - String: 'Cannot SLP vectorize list: only 2 elements of buildvector, trying reduction first.'
1211

1312
define <2 x i32> @g(<2 x i32> %x, i32 %a, i32 %b) {
1413
; CHECK-LABEL: @g(

0 commit comments

Comments
 (0)