Skip to content

Commit 8ac6869

Browse files
committed
[Passes] Only run extra vector passes if loops have been vectorized.
This patch uses a similar trick as in D113947 to only run the extra passes after vectorization on functions where loops have been vectorized.

The reason for running the 'extra vector passes' is simplification/unswitching of the runtime checks created by LV; there should be no need to run them if nothing got vectorized.

To do that, a new dummy analysis ShouldRunExtraVectorPasses has been added. If loops have been vectorized for a function, LV will cache the analysis. At the moment it uses MadeCFGChange as a proxy for "a loop got vectorized", which isn't perfect (it could be too aggressive, e.g. because no runtime checks have been added), but should be good enough for now.

The extra passes are now managed by a new FunctionPassManager that runs its passes only if ShouldRunExtraVectorPasses has been cached.

Without this patch, `-extra-vectorizer-passes` has the following compile-time impact:
NewPM-O3: +4.86%
NewPM-ReleaseThinLTO: +3.56%
NewPM-ReleaseLTO-g: +7.17%
http://llvm-compile-time-tracker.com/compare.php?from=ead3979a92fc33add4710c4510d6906260dcb4ad&to=c292da649e2c6e88a31e702fdc474727d09c72bc&stat=instructions

With this patch, that gets reduced to:
NewPM-O3: +1.43%
NewPM-ReleaseThinLTO: +1.00%
NewPM-ReleaseLTO-g: +1.58%
http://llvm-compile-time-tracker.com/compare.php?from=ead3979a92fc33add4710c4510d6906260dcb4ad&to=e67d86b57810011cf285eb9aa1944781be6096f0&stat=instructions

It is probably still too high to enable by default, but much better.

Reviewed By: aeubanks
Differential Revision: https://reviews.llvm.org/D115052
1 parent eeb4c48 commit 8ac6869

File tree

5 files changed

+79
-38
lines changed

5 files changed

+79
-38
lines changed

llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,38 @@ class TargetTransformInfo;
8080
extern cl::opt<bool> EnableLoopInterleaving;
8181
extern cl::opt<bool> EnableLoopVectorization;
8282

83+
/// A marker to determine if extra passes after loop vectorization should be
84+
/// run.
85+
struct ShouldRunExtraVectorPasses
86+
: public AnalysisInfoMixin<ShouldRunExtraVectorPasses> {
87+
static AnalysisKey Key;
88+
struct Result {
89+
bool invalidate(Function &F, const PreservedAnalyses &PA,
90+
FunctionAnalysisManager::Invalidator &) {
91+
// Check whether the analysis has been explicitly invalidated. Otherwise,
92+
// it remains preserved.
93+
auto PAC = PA.getChecker<ShouldRunExtraVectorPasses>();
94+
return !PAC.preservedWhenStateless();
95+
}
96+
};
97+
98+
Result run(Function &F, FunctionAnalysisManager &FAM) { return Result(); }
99+
};
100+
101+
/// A pass manager to run a set of extra function simplification passes after
102+
/// vectorization, if requested. LoopVectorize caches the
103+
/// ShouldRunExtraVectorPasses analysis to request extra simplifications, if
104+
/// they could be beneficial.
105+
struct ExtraVectorPassManager : public FunctionPassManager {
106+
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) {
107+
auto PA = PreservedAnalyses::all();
108+
if (AM.getCachedResult<ShouldRunExtraVectorPasses>(F))
109+
PA.intersect(FunctionPassManager::run(F, AM));
110+
PA.abandon<ShouldRunExtraVectorPasses>();
111+
return PA;
112+
}
113+
};
114+
83115
struct LoopVectorizeOptions {
84116
/// If false, consider all loops for interleaving.
85117
/// If true, only loops that explicitly request interleaving are considered.

llvm/lib/Passes/PassBuilder.cpp

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1234,26 +1234,28 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
12341234
FPM.addPass(InstCombinePass());
12351235

12361236
if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) {
1237+
ExtraVectorPassManager ExtraPasses;
12371238
// At higher optimization levels, try to clean up any runtime overlap and
12381239
// alignment checks inserted by the vectorizer. We want to track correlated
12391240
// runtime checks for two inner loops in the same outer loop, fold any
12401241
// common computations, hoist loop-invariant aspects out of any outer loop,
12411242
// and unswitch the runtime checks if possible. Once hoisted, we may have
12421243
// dead (or speculatable) control flows or more combining opportunities.
1243-
FPM.addPass(EarlyCSEPass());
1244-
FPM.addPass(CorrelatedValuePropagationPass());
1245-
FPM.addPass(InstCombinePass());
1244+
ExtraPasses.addPass(EarlyCSEPass());
1245+
ExtraPasses.addPass(CorrelatedValuePropagationPass());
1246+
ExtraPasses.addPass(InstCombinePass());
12461247
LoopPassManager LPM;
12471248
LPM.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
12481249
LPM.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level ==
12491250
OptimizationLevel::O3));
1250-
FPM.addPass(
1251+
ExtraPasses.addPass(
12511252
RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
1252-
FPM.addPass(createFunctionToLoopPassAdaptor(
1253+
ExtraPasses.addPass(createFunctionToLoopPassAdaptor(
12531254
std::move(LPM), EnableMSSALoopDependency,
12541255
/*UseBlockFrequencyInfo=*/true));
1255-
FPM.addPass(SimplifyCFGPass());
1256-
FPM.addPass(InstCombinePass());
1256+
ExtraPasses.addPass(SimplifyCFGPass());
1257+
ExtraPasses.addPass(InstCombinePass());
1258+
FPM.addPass(std::move(ExtraPasses));
12571259
}
12581260

12591261
// Now that we've formed fast to execute loop structures, we do further

llvm/lib/Passes/PassRegistry.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,7 @@ FUNCTION_ANALYSIS("regions", RegionInfoAnalysis())
168168
FUNCTION_ANALYSIS("no-op-function", NoOpFunctionAnalysis())
169169
FUNCTION_ANALYSIS("opt-remark-emit", OptimizationRemarkEmitterAnalysis())
170170
FUNCTION_ANALYSIS("scalar-evolution", ScalarEvolutionAnalysis())
171+
FUNCTION_ANALYSIS("should-run-extra-vector-passes", ShouldRunExtraVectorPasses())
171172
FUNCTION_ANALYSIS("stack-safety-local", StackSafetyAnalysis())
172173
FUNCTION_ANALYSIS("targetlibinfo", TargetLibraryAnalysis())
173174
FUNCTION_ANALYSIS("targetir",

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -429,6 +429,8 @@ class GeneratedRTChecks;
429429

430430
namespace llvm {
431431

432+
AnalysisKey ShouldRunExtraVectorPasses::Key;
433+
432434
/// InnerLoopVectorizer vectorizes loops which contain only one basic
433435
/// block to a specified vectorization factor (VF).
434436
/// This class performs the widening of scalars into vectors, or multiple
@@ -10433,7 +10435,16 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
1043310435
PA.preserve<LoopAnalysis>();
1043410436
PA.preserve<DominatorTreeAnalysis>();
1043510437
}
10436-
if (!Result.MadeCFGChange)
10438+
10439+
if (Result.MadeCFGChange) {
10440+
// Making CFG changes likely means a loop got vectorized. Indicate that
10441+
// extra simplification passes should be run.
10442+
// TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10443+
// be run if runtime checks have been added.
10444+
AM.getResult<ShouldRunExtraVectorPasses>(F);
10445+
PA.preserve<ShouldRunExtraVectorPasses>();
10446+
} else {
1043710447
PA.preserveSet<CFGAnalyses>();
10448+
}
1043810449
return PA;
1043910450
}

llvm/test/Other/opt-pipeline-vector-passes.ll

Lines changed: 25 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,21 @@
1-
; RUN: opt -disable-verify -debug-pass-manager -passes='default<O1>' -S %s 2>&1 | FileCheck %s --check-prefixes=NEWPM_O1
2-
; RUN: opt -disable-verify -debug-pass-manager -passes='default<O2>' -S %s 2>&1 | FileCheck %s --check-prefixes=NEWPM_O2
3-
; RUN: opt -disable-verify -debug-pass-manager -passes='default<O2>' -extra-vectorizer-passes -S %s 2>&1 | FileCheck %s --check-prefixes=NEWPM_O2_EXTRA
1+
; RUN: opt -disable-verify -debug-pass-manager -passes='default<O1>' -force-vector-width=4 -S %s 2>&1 | FileCheck %s --check-prefixes=O1
2+
; RUN: opt -disable-verify -debug-pass-manager -passes='default<O2>' -force-vector-width=4 -S %s 2>&1 | FileCheck %s --check-prefixes=O2
3+
; RUN: opt -disable-verify -debug-pass-manager -passes='default<O2>' -force-vector-width=4 -extra-vectorizer-passes -S %s 2>&1 | FileCheck %s --check-prefixes=O2_EXTRA
4+
5+
; When the loop doesn't get vectorized, no extra vector passes should run.
6+
; RUN: opt -disable-verify -debug-pass-manager -passes='default<O2>' -force-vector-width=0 -extra-vectorizer-passes -S %s 2>&1 | FileCheck %s --check-prefixes=O2
47

58
; REQUIRES: asserts
69

10+
; The loop vectorizer still runs at both -O1/-O2 even with the
11+
; debug flag, but it only works on loops explicitly annotated
12+
; with pragmas.
13+
714
; SLP does not run at -O1. Loop vectorization runs, but it only
815
; works on loops explicitly annotated with pragmas.
9-
10-
; OLDPM_O1-LABEL: Pass Arguments:
11-
; OLDPM_O1: Loop Vectorization
12-
; OLDPM_O1-NOT: SLP Vectorizer
13-
; OLDPM_O1: Optimize scalar/vector ops
16+
; O1-LABEL: Running pass: LoopVectorizePass
17+
; O1-NOT: Running pass: SLPVectorizerPass
18+
; O1: Running pass: VectorCombinePass
1419

1520
; Everything runs at -O2.
1621
; O2-LABEL: Running pass: LoopVectorizePass
@@ -19,28 +24,18 @@
1924
; O2: Running pass: SLPVectorizerPass
2025
; O2: Running pass: VectorCombinePass
2126

22-
; There should be no difference with the new pass manager.
23-
; This is tested more thoroughly in other test files.
24-
25-
; NEWPM_O1-LABEL: Running pass: LoopVectorizePass
26-
; NEWPM_O1-NOT: Running pass: SLPVectorizerPass
27-
; NEWPM_O1: Running pass: VectorCombinePass
28-
29-
; NEWPM_O2-LABEL: Running pass: LoopVectorizePass
30-
; NEWPM_O2: Running pass: SLPVectorizerPass
31-
; NEWPM_O2: Running pass: VectorCombinePass
32-
33-
; NEWPM_O2_EXTRA-LABEL: Running pass: LoopVectorizePass
34-
; NEWPM_O2_EXTRA: Running pass: EarlyCSEPass
35-
; NEWPM_O2_EXTRA: Running pass: CorrelatedValuePropagationPass
36-
; NEWPM_O2_EXTRA: Running pass: InstCombinePass
37-
; NEWPM_O2_EXTRA: Running pass: LICMPass
38-
; NEWPM_O2_EXTRA: Running pass: SimpleLoopUnswitchPass
39-
; NEWPM_O2_EXTRA: Running pass: SimplifyCFGPass
40-
; NEWPM_O2_EXTRA: Running pass: InstCombinePass
41-
; NEWPM_O2_EXTRA: Running pass: SLPVectorizerPass
42-
; NEWPM_O2_EXTRA: Running pass: EarlyCSEPass
43-
; NEWPM_O2_EXTRA: Running pass: VectorCombinePass
27+
; Optionally run cleanup passes.
28+
; O2_EXTRA-LABEL: Running pass: LoopVectorizePass
29+
; O2_EXTRA: Running pass: EarlyCSEPass
30+
; O2_EXTRA: Running pass: CorrelatedValuePropagationPass
31+
; O2_EXTRA: Running pass: InstCombinePass
32+
; O2_EXTRA: Running pass: LICMPass
33+
; O2_EXTRA: Running pass: SimpleLoopUnswitchPass
34+
; O2_EXTRA: Running pass: SimplifyCFGPass
35+
; O2_EXTRA: Running pass: InstCombinePass
36+
; O2_EXTRA: Running pass: SLPVectorizerPass
37+
; O2_EXTRA: Running pass: EarlyCSEPass
38+
; O2_EXTRA: Running pass: VectorCombinePass
4439

4540
define i64 @f(i1 %cond, i32* %src, i32* %dst) {
4641
entry:

0 commit comments

Comments
 (0)