Skip to content

Commit 163bb6d

Browse files
committed
[Passes][VectorCombine] enable early run generally and try load folds
An early run of VectorCombine was added with D102496 specifically to deal with unnecessary vector ops produced with the C matrix extension.

This patch proposes to try those folds in general and to add a pair of load folds to the menu.

The load transform will partly solve (see the PhaseOrdering diffs) a longstanding vectorization perf bug by removing redundant loads via GVN: issue #17113.

The main reason for not enabling the extra pass generally in the initial patch was compile-time cost. The cost of VectorCombine was significantly (and surprisingly) improved with commit 87debda:
https://llvm-compile-time-tracker.com/compare.php?from=ffe05b8f57d97bc4340f791cb386c8d00e0739f2&to=87debdadaf18f8a5c7e5d563889e10731dc3554d&stat=instructions:u

...so the extra run is going to cost very little now - the total cost of the 2 runs should be less than the cost of the 1 run before that micro-optimization:
https://llvm-compile-time-tracker.com/compare.php?from=5e8c2026d10e8e2c93c038c776853bed0e7c8fc1&to=2c4b68eab5ae969811f422714e0eba44c5f7eefb&stat=instructions:u

It may be possible to reduce the cost slightly more with a few more early exits like that, but the gain is probably in the noise based on timing experiments.

Differential Revision: https://reviews.llvm.org/D138353
1 parent 8f337f8 commit 163bb6d

9 files changed

+15
-18
lines changed

llvm/lib/Passes/PassBuilderPipelines.cpp

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -615,10 +615,9 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
615615
// Delete small array after loop unroll.
616616
FPM.addPass(SROAPass());
617617

618-
// The matrix extension can introduce large vector operations early, which can
619-
// benefit from running vector-combine early on.
620-
if (EnableMatrix)
621-
FPM.addPass(VectorCombinePass(/*TryEarlyFoldsOnly=*/true));
618+
// Try vectorization/scalarization transforms that are both improvements
619+
// themselves and can allow further folds with GVN and InstCombine.
620+
FPM.addPass(VectorCombinePass(/*TryEarlyFoldsOnly=*/true));
622621

623622
// Eliminate redundancies.
624623
FPM.addPass(MergedLoadStoreMotionPass());

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1700,8 +1700,6 @@ bool VectorCombine::run() {
17001700
Builder.SetInsertPoint(&I);
17011701
if (!TryEarlyFoldsOnly) {
17021702
if (isa<FixedVectorType>(I.getType())) {
1703-
MadeChange |= vectorizeLoadInsert(I);
1704-
MadeChange |= widenSubvectorLoad(I);
17051703
MadeChange |= foldInsExtFNeg(I);
17061704
MadeChange |= foldBitcastShuf(I);
17071705
MadeChange |= foldShuffleOfBinops(I);
@@ -1713,6 +1711,8 @@ bool VectorCombine::run() {
17131711
}
17141712
}
17151713
if (isa<FixedVectorType>(I.getType())) {
1714+
MadeChange |= vectorizeLoadInsert(I);
1715+
MadeChange |= widenSubvectorLoad(I);
17161716
MadeChange |= scalarizeBinopOrCmp(I);
17171717
MadeChange |= scalarizeLoadExtract(I);
17181718
}

llvm/test/Other/new-pm-defaults.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@
185185
; CHECK-O-NEXT: Running pass: LoopFullUnrollPass
186186
; CHECK-EP-LOOP-END-NEXT: Running pass: NoOpLoopPass
187187
; CHECK-O-NEXT: Running pass: SROAPass on foo
188-
; CHECK-MATRIX: Running pass: VectorCombinePass
188+
; CHECK-O23SZ-NEXT: Running pass: VectorCombinePass
189189
; CHECK-O23SZ-NEXT: Running pass: MergedLoadStoreMotionPass
190190
; CHECK-O23SZ-NEXT: Running pass: GVNPass
191191
; CHECK-O23SZ-NEXT: Running analysis: MemoryDependenceAnalysis

llvm/test/Other/new-pm-thinlto-defaults.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,7 @@
158158
; CHECK-O-NEXT: Running pass: LoopDeletionPass
159159
; CHECK-O-NEXT: Running pass: LoopFullUnrollPass
160160
; CHECK-O-NEXT: Running pass: SROAPass on foo
161+
; CHECK-O23SZ-NEXT: Running pass: VectorCombinePass
161162
; CHECK-O23SZ-NEXT: Running pass: MergedLoadStoreMotionPass
162163
; CHECK-O23SZ-NEXT: Running pass: GVNPass
163164
; CHECK-O23SZ-NEXT: Running analysis: MemoryDependenceAnalysis

llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@
119119
; CHECK-O-NEXT: Running pass: LoopDeletionPass
120120
; CHECK-O-NEXT: Running pass: LoopFullUnrollPass
121121
; CHECK-O-NEXT: Running pass: SROAPass on foo
122+
; CHECK-O23SZ-NEXT: Running pass: VectorCombinePass
122123
; CHECK-O23SZ-NEXT: Running pass: MergedLoadStoreMotionPass
123124
; CHECK-O23SZ-NEXT: Running pass: GVNPass
124125
; CHECK-O23SZ-NEXT: Running analysis: MemoryDependenceAnalysis

llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@
128128
; CHECK-O-NEXT: Running pass: LoopDeletionPass
129129
; CHECK-O-NEXT: Running pass: LoopFullUnrollPass
130130
; CHECK-O-NEXT: Running pass: SROAPass on foo
131+
; CHECK-O23SZ-NEXT: Running pass: VectorCombinePass
131132
; CHECK-O23SZ-NEXT: Running pass: MergedLoadStoreMotionPass
132133
; CHECK-O23SZ-NEXT: Running pass: GVNPass
133134
; CHECK-O23SZ-NEXT: Running analysis: MemoryDependenceAnalysis

llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,7 @@
157157
; CHECK-O-NEXT: Running pass: LoopDeletionPass
158158
; CHECK-O-NEXT: Running pass: LoopFullUnrollPass
159159
; CHECK-O-NEXT: Running pass: SROAPass on foo
160+
; CHECK-O23SZ-NEXT: Running pass: VectorCombinePass
160161
; CHECK-O23SZ-NEXT: Running pass: MergedLoadStoreMotionPass
161162
; CHECK-O23SZ-NEXT: Running pass: GVNPass
162163
; CHECK-O23SZ-NEXT: Running analysis: MemoryDependenceAnalysis

llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@
122122
; CHECK-O-NEXT: Running pass: IndVarSimplifyPass
123123
; CHECK-O-NEXT: Running pass: LoopDeletionPass
124124
; CHECK-O-NEXT: Running pass: SROAPass on foo
125+
; CHECK-O23SZ-NEXT: Running pass: VectorCombinePass
125126
; CHECK-O23SZ-NEXT: Running pass: MergedLoadStoreMotionPass
126127
; CHECK-O23SZ-NEXT: Running pass: GVNPass
127128
; CHECK-O23SZ-NEXT: Running analysis: MemoryDependenceAnalysis

llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -12,20 +12,13 @@ $getAt = comdat any
1212
define dso_local noundef <4 x float> @ConvertVectors_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %0) #0 {
1313
; SSE-LABEL: @ConvertVectors_ByRef(
1414
; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
15-
; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds [4 x float], ptr [[TMP0]], i64 0, i64 1
16-
; SSE-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[TMP3]], align 4
17-
; SSE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
18-
; SSE-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
19-
; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 5>
20-
; SSE-NEXT: ret <4 x float> [[TMP7]]
15+
; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
16+
; SSE-NEXT: ret <4 x float> [[TMP3]]
2117
;
2218
; AVX-LABEL: @ConvertVectors_ByRef(
2319
; AVX-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
24-
; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds [4 x float], ptr [[TMP0]], i64 0, i64 2
25-
; AVX-NEXT: [[TMP4:%.*]] = load float, ptr [[TMP3]], align 8
26-
; AVX-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP4]], i64 2
27-
; AVX-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[TMP4]], i64 3
28-
; AVX-NEXT: ret <4 x float> [[TMP6]]
20+
; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
21+
; AVX-NEXT: ret <4 x float> [[TMP3]]
2922
;
3023
%2 = alloca ptr, align 8
3124
%3 = alloca <4 x float>, align 16

0 commit comments

Comments
 (0)