Skip to content

Commit 71a3168

Browse files
committed
[PassManager] adjust VectorCombine placement
The initial placement of vector-combine in the opt pipeline revealed phase ordering bugs:
https://bugs.llvm.org/show_bug.cgi?id=45015
https://bugs.llvm.org/show_bug.cgi?id=42022

This patch contains a few independent changes:

1. Move the pass up in the pipeline, so it happens just after loop-vectorization. This is only to keep vectorization passes together in the pipeline at the moment. I don't have evidence of interaction between these yet.

2. Add an -early-cse pass after -vector-combine to clean up redundant ops. This was partly proposed as far back as rL219644 (which is why it's effectively being moved in the old PM code). This is important because the subsequent -instcombine doesn't work as well without EarlyCSE. With the CSE, -instcombine is able to squash shuffles together in 1 of the tests (because those are simple "select" shuffles).

3. Remove the -vector-combine pass that was running after SLP. We may want to do that eventually, but I don't have a test case to support it yet.

Differential Revision: https://reviews.llvm.org/D75145
1 parent 95a94df commit 71a3168

File tree

10 files changed

+33
-54
lines changed

10 files changed

+33
-54
lines changed

llvm/lib/Passes/PassBuilder.cpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -966,12 +966,15 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline(
966966
OptimizePM.addPass(LoopVectorizePass(
967967
LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization)));
968968

969+
// Enhance/cleanup vector code.
970+
OptimizePM.addPass(VectorCombinePass());
971+
OptimizePM.addPass(EarlyCSEPass());
972+
969973
// Eliminate loads by forwarding stores from the previous iteration to loads
970974
// of the current iteration.
971975
OptimizePM.addPass(LoopLoadEliminationPass());
972976

973977
// Cleanup after the loop optimization passes.
974-
OptimizePM.addPass(VectorCombinePass());
975978
OptimizePM.addPass(InstCombinePass());
976979

977980
// Now that we've formed fast to execute loop structures, we do further
@@ -990,10 +993,8 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline(
990993
sinkCommonInsts(true)));
991994

992995
// Optimize parallel scalar instruction chains into SIMD instructions.
993-
if (PTO.SLPVectorization) {
996+
if (PTO.SLPVectorization)
994997
OptimizePM.addPass(SLPVectorizerPass());
995-
OptimizePM.addPass(VectorCombinePass());
996-
}
997998

998999
OptimizePM.addPass(InstCombinePass());
9991000

llvm/lib/Transforms/IPO/PassManagerBuilder.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -729,6 +729,8 @@ void PassManagerBuilder::populateModulePassManager(
729729
MPM.add(createLoopDistributePass());
730730

731731
MPM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize));
732+
MPM.add(createVectorCombinePass());
733+
MPM.add(createEarlyCSEPass());
732734

733735
// Eliminate loads by forwarding stores from the previous iteration to loads
734736
// of the current iteration.
@@ -739,7 +741,6 @@ void PassManagerBuilder::populateModulePassManager(
739741
// on -O1 and no #pragma is found). Would be good to have these two passes
740742
// as function calls, so that we can only pass them when the vectorizer
741743
// changed the code.
742-
MPM.add(createVectorCombinePass());
743744
addInstructionCombiningPass(MPM);
744745
if (OptLevel > 1 && ExtraVectorizerPasses) {
745746
// At higher optimization levels, try to clean up any runtime overlap and
@@ -748,7 +749,6 @@ void PassManagerBuilder::populateModulePassManager(
748749
// common computations, hoist loop-invariant aspects out of any outer loop,
749750
// and unswitch the runtime checks if possible. Once hoisted, we may have
750751
// dead (or speculatable) control flows or more combining opportunities.
751-
MPM.add(createEarlyCSEPass());
752752
MPM.add(createCorrelatedValuePropagationPass());
753753
addInstructionCombiningPass(MPM);
754754
MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
@@ -766,7 +766,6 @@ void PassManagerBuilder::populateModulePassManager(
766766

767767
if (SLPVectorize) {
768768
MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
769-
MPM.add(createVectorCombinePass());
770769
if (OptLevel > 1 && ExtraVectorizerPasses) {
771770
MPM.add(createEarlyCSEPass());
772771
}

llvm/test/Other/new-pm-defaults.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -250,17 +250,15 @@
250250
; CHECK-O-NEXT: Running pass: LoopVectorizePass
251251
; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis
252252
; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis
253+
; CHECK-O-NEXT: Running pass: VectorCombinePass
254+
; CHECK-O-NEXT: Running pass: EarlyCSEPass
253255
; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
254256
; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis
255-
; CHECK-O-NEXT: Running pass: VectorCombinePass
256257
; CHECK-O-NEXT: Running pass: InstCombinePass
257258
; CHECK-O-NEXT: Running pass: SimplifyCFGPass
258259
; CHECK-O2-NEXT: Running pass: SLPVectorizerPass
259260
; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
260261
; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
261-
; CHECK-O2-NEXT: Running pass: VectorCombinePass
262-
; CHECK-O3-NEXT: Running pass: VectorCombinePass
263-
; CHECK-Os-NEXT: Running pass: VectorCombinePass
264262
; CHECK-O-NEXT: Running pass: InstCombinePass
265263
; CHECK-O-NEXT: Running pass: LoopUnrollPass
266264
; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass

llvm/test/Other/new-pm-thinlto-defaults.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -220,17 +220,15 @@
220220
; CHECK-POSTLINK-O-NEXT: Running pass: LoopVectorizePass
221221
; CHECK-POSTLINK-O-NEXT: Running analysis: BlockFrequencyAnalysis
222222
; CHECK-POSTLINK-O-NEXT: Running analysis: BranchProbabilityAnalysis
223+
; CHECK-POSTLINK-O-NEXT: Running pass: VectorCombinePass
224+
; CHECK-POSTLINK-O-NEXT: Running pass: EarlyCSEPass
223225
; CHECK-POSTLINK-O-NEXT: Running pass: LoopLoadEliminationPass
224226
; CHECK-POSTLINK-O-NEXT: Running analysis: LoopAccessAnalysis
225-
; CHECK-POSTLINK-O-NEXT: Running pass: VectorCombinePass
226227
; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
227228
; CHECK-POSTLINK-O-NEXT: Running pass: SimplifyCFGPass
228229
; CHECK-POSTLINK-O2-NEXT: Running pass: SLPVectorizerPass
229230
; CHECK-POSTLINK-O3-NEXT: Running pass: SLPVectorizerPass
230231
; CHECK-POSTLINK-Os-NEXT: Running pass: SLPVectorizerPass
231-
; CHECK-POSTLINK-O2-NEXT: Running pass: VectorCombinePass
232-
; CHECK-POSTLINK-O3-NEXT: Running pass: VectorCombinePass
233-
; CHECK-POSTLINK-Os-NEXT: Running pass: VectorCombinePass
234232
; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
235233
; CHECK-POSTLINK-O-NEXT: Running pass: LoopUnrollPass
236234
; CHECK-POSTLINK-O-NEXT: Running pass: WarnMissedTransformationsPass

llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -188,17 +188,15 @@
188188
; CHECK-O-NEXT: Finished {{.*}}Function pass manager run
189189
; CHECK-O-NEXT: Running pass: LoopDistributePass
190190
; CHECK-O-NEXT: Running pass: LoopVectorizePass
191+
; CHECK-O-NEXT: Running pass: VectorCombinePass
192+
; CHECK-O-NEXT: Running pass: EarlyCSEPass
191193
; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
192194
; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis
193-
; CHECK-O-NEXT: Running pass: VectorCombinePass
194195
; CHECK-O-NEXT: Running pass: InstCombinePass
195196
; CHECK-O-NEXT: Running pass: SimplifyCFGPass
196197
; CHECK-O2-NEXT: Running pass: SLPVectorizerPass
197198
; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
198199
; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
199-
; CHECK-O2-NEXT: Running pass: VectorCombinePass
200-
; CHECK-O3-NEXT: Running pass: VectorCombinePass
201-
; CHECK-Os-NEXT: Running pass: VectorCombinePass
202200
; CHECK-O-NEXT: Running pass: InstCombinePass
203201
; CHECK-O-NEXT: Running pass: LoopUnrollPass
204202
; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass

llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -199,17 +199,15 @@
199199
; CHECK-O-NEXT: Finished {{.*}}Function pass manager run
200200
; CHECK-O-NEXT: Running pass: LoopDistributePass
201201
; CHECK-O-NEXT: Running pass: LoopVectorizePass
202+
; CHECK-O-NEXT: Running pass: VectorCombinePass
203+
; CHECK-O-NEXT: Running pass: EarlyCSEPass
202204
; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
203205
; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis
204-
; CHECK-O-NEXT: Running pass: VectorCombinePass
205206
; CHECK-O-NEXT: Running pass: InstCombinePass
206207
; CHECK-O-NEXT: Running pass: SimplifyCFGPass
207208
; CHECK-O2-NEXT: Running pass: SLPVectorizerPass
208209
; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
209210
; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
210-
; CHECK-O2-NEXT: Running pass: VectorCombinePass
211-
; CHECK-O3-NEXT: Running pass: VectorCombinePass
212-
; CHECK-Os-NEXT: Running pass: VectorCombinePass
213211
; CHECK-O-NEXT: Running pass: InstCombinePass
214212
; CHECK-O-NEXT: Running pass: LoopUnrollPass
215213
; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass

llvm/test/Other/opt-O2-pipeline.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -225,14 +225,15 @@
225225
; CHECK-NEXT: Optimization Remark Emitter
226226
; CHECK-NEXT: Inject TLI Mappings
227227
; CHECK-NEXT: Loop Vectorization
228+
; CHECK-NEXT: Optimize scalar/vector ops
229+
; CHECK-NEXT: Early CSE
228230
; CHECK-NEXT: Canonicalize natural loops
229231
; CHECK-NEXT: Scalar Evolution Analysis
230232
; CHECK-NEXT: Function Alias Analysis Results
231233
; CHECK-NEXT: Loop Access Analysis
232234
; CHECK-NEXT: Lazy Branch Probability Analysis
233235
; CHECK-NEXT: Lazy Block Frequency Analysis
234236
; CHECK-NEXT: Loop Load Elimination
235-
; CHECK-NEXT: Optimize scalar/vector ops
236237
; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
237238
; CHECK-NEXT: Function Alias Analysis Results
238239
; CHECK-NEXT: Lazy Branch Probability Analysis
@@ -250,8 +251,6 @@
250251
; CHECK-NEXT: Lazy Block Frequency Analysis
251252
; CHECK-NEXT: Optimization Remark Emitter
252253
; CHECK-NEXT: SLP Vectorizer
253-
; CHECK-NEXT: Optimize scalar/vector ops
254-
; CHECK-NEXT: Function Alias Analysis Results
255254
; CHECK-NEXT: Optimization Remark Emitter
256255
; CHECK-NEXT: Combine redundant instructions
257256
; CHECK-NEXT: Canonicalize natural loops

llvm/test/Other/opt-O3-pipeline.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -230,14 +230,15 @@
230230
; CHECK-NEXT: Optimization Remark Emitter
231231
; CHECK-NEXT: Inject TLI Mappings
232232
; CHECK-NEXT: Loop Vectorization
233+
; CHECK-NEXT: Optimize scalar/vector ops
234+
; CHECK-NEXT: Early CSE
233235
; CHECK-NEXT: Canonicalize natural loops
234236
; CHECK-NEXT: Scalar Evolution Analysis
235237
; CHECK-NEXT: Function Alias Analysis Results
236238
; CHECK-NEXT: Loop Access Analysis
237239
; CHECK-NEXT: Lazy Branch Probability Analysis
238240
; CHECK-NEXT: Lazy Block Frequency Analysis
239241
; CHECK-NEXT: Loop Load Elimination
240-
; CHECK-NEXT: Optimize scalar/vector ops
241242
; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
242243
; CHECK-NEXT: Function Alias Analysis Results
243244
; CHECK-NEXT: Lazy Branch Probability Analysis
@@ -255,8 +256,6 @@
255256
; CHECK-NEXT: Lazy Block Frequency Analysis
256257
; CHECK-NEXT: Optimization Remark Emitter
257258
; CHECK-NEXT: SLP Vectorizer
258-
; CHECK-NEXT: Optimize scalar/vector ops
259-
; CHECK-NEXT: Function Alias Analysis Results
260259
; CHECK-NEXT: Optimization Remark Emitter
261260
; CHECK-NEXT: Combine redundant instructions
262261
; CHECK-NEXT: Canonicalize natural loops

llvm/test/Other/opt-Os-pipeline.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -212,14 +212,15 @@
212212
; CHECK-NEXT: Optimization Remark Emitter
213213
; CHECK-NEXT: Inject TLI Mappings
214214
; CHECK-NEXT: Loop Vectorization
215+
; CHECK-NEXT: Optimize scalar/vector ops
216+
; CHECK-NEXT: Early CSE
215217
; CHECK-NEXT: Canonicalize natural loops
216218
; CHECK-NEXT: Scalar Evolution Analysis
217219
; CHECK-NEXT: Function Alias Analysis Results
218220
; CHECK-NEXT: Loop Access Analysis
219221
; CHECK-NEXT: Lazy Branch Probability Analysis
220222
; CHECK-NEXT: Lazy Block Frequency Analysis
221223
; CHECK-NEXT: Loop Load Elimination
222-
; CHECK-NEXT: Optimize scalar/vector ops
223224
; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
224225
; CHECK-NEXT: Function Alias Analysis Results
225226
; CHECK-NEXT: Lazy Branch Probability Analysis
@@ -237,8 +238,6 @@
237238
; CHECK-NEXT: Lazy Block Frequency Analysis
238239
; CHECK-NEXT: Optimization Remark Emitter
239240
; CHECK-NEXT: SLP Vectorizer
240-
; CHECK-NEXT: Optimize scalar/vector ops
241-
; CHECK-NEXT: Function Alias Analysis Results
242241
; CHECK-NEXT: Optimization Remark Emitter
243242
; CHECK-NEXT: Combine redundant instructions
244243
; CHECK-NEXT: Canonicalize natural loops

llvm/test/Transforms/PhaseOrdering/X86/addsub.ll

Lines changed: 12 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -4,19 +4,15 @@
44

55
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
66

7-
; TODO: Ideally, this should reach the backend with 1 fsub, 1 fadd, and 1 shuffle.
7+
; Ideally, this should reach the backend with 1 fsub, 1 fadd, and 1 shuffle.
88
; That may require some coordination between VectorCombine, SLP, and other passes.
99
; The end goal is to get a single "vaddsubps" instruction for x86 with AVX.
1010

1111
define <4 x float> @PR45015(<4 x float> %arg, <4 x float> %arg1) {
1212
; CHECK-LABEL: @PR45015(
1313
; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x float> [[ARG:%.*]], [[ARG1:%.*]]
1414
; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[ARG]], [[ARG1]]
15-
; CHECK-NEXT: [[T8:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
16-
; CHECK-NEXT: [[TMP3:%.*]] = fsub <4 x float> [[ARG]], [[ARG1]]
17-
; CHECK-NEXT: [[T12:%.*]] = shufflevector <4 x float> [[T8]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
18-
; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[ARG]], [[ARG1]]
19-
; CHECK-NEXT: [[T16:%.*]] = shufflevector <4 x float> [[T12]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
15+
; CHECK-NEXT: [[T16:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
2016
; CHECK-NEXT: ret <4 x float> [[T16]]
2117
;
2218
%t = extractelement <4 x float> %arg, i32 0
@@ -45,13 +41,9 @@ define <4 x float> @PR45015(<4 x float> %arg, <4 x float> %arg1) {
4541
define { <2 x float>, <2 x float> } @add_aggregate(<2 x float> %a0, <2 x float> %a1, <2 x float> %b0, <2 x float> %b1) {
4642
; CHECK-LABEL: @add_aggregate(
4743
; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x float> [[A0:%.*]], [[B0:%.*]]
48-
; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x float> [[A0]], [[B0]]
49-
; CHECK-NEXT: [[RETVAL_0_1_INSERT:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
50-
; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]]
51-
; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[A1]], [[B1]]
52-
; CHECK-NEXT: [[RETVAL_1_1_INSERT:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
53-
; CHECK-NEXT: [[FCA_0_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } undef, <2 x float> [[RETVAL_0_1_INSERT]], 0
54-
; CHECK-NEXT: [[FCA_1_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } [[FCA_0_INSERT]], <2 x float> [[RETVAL_1_1_INSERT]], 1
44+
; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]]
45+
; CHECK-NEXT: [[FCA_0_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } undef, <2 x float> [[TMP1]], 0
46+
; CHECK-NEXT: [[FCA_1_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } [[FCA_0_INSERT]], <2 x float> [[TMP2]], 1
5547
; CHECK-NEXT: ret { <2 x float>, <2 x float> } [[FCA_1_INSERT]]
5648
;
5749
%a00 = extractelement <2 x float> %a0, i32 0
@@ -81,18 +73,16 @@ define void @add_aggregate_store(<2 x float> %a0, <2 x float> %a1, <2 x float> %
8173
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
8274
; CHECK-NEXT: [[R0:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4:%.*]], %struct.Vector4* [[R:%.*]], i64 0, i32 0
8375
; CHECK-NEXT: store float [[TMP2]], float* [[R0]], align 4
84-
; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[A0]], [[B0]]
85-
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
76+
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
8677
; CHECK-NEXT: [[R1:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 1
87-
; CHECK-NEXT: store float [[TMP4]], float* [[R1]], align 4
88-
; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]]
89-
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0
78+
; CHECK-NEXT: store float [[TMP3]], float* [[R1]], align 4
79+
; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]]
80+
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
9081
; CHECK-NEXT: [[R2:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 2
91-
; CHECK-NEXT: store float [[TMP6]], float* [[R2]], align 4
92-
; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x float> [[A1]], [[B1]]
93-
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP7]], i32 1
82+
; CHECK-NEXT: store float [[TMP5]], float* [[R2]], align 4
83+
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
9484
; CHECK-NEXT: [[R3:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 3
95-
; CHECK-NEXT: store float [[TMP8]], float* [[R3]], align 4
85+
; CHECK-NEXT: store float [[TMP6]], float* [[R3]], align 4
9686
; CHECK-NEXT: ret void
9787
;
9888
%a00 = extractelement <2 x float> %a0, i32 0

0 commit comments

Comments
 (0)