Skip to content

Commit 9594c69

Browse files
committed
[VectorCombine] Add option to only run scalarization transforms.
This patch adds a pass option to only run transforms that scalarize vector operations and do not create new vector instructions. When VectorCombine runs early in the pipeline, introducing new vector operations can have negative effects, such as blocking loop or SLP vectorization. To avoid regressions, restrict the early VectorCombine run (when using -enable-matrix) to only perform scalarization and not introduce new vector operations. This is done as an option to the pass directly, which is then set when adding the pass to the pipeline. This is done for the new pass manager only. Reviewed By: spatel Differential Revision: https://reviews.llvm.org/D111800 (cherry-picked from 4a1d63d)
1 parent 8ffb8dc commit 9594c69

File tree

4 files changed

+36
-23
lines changed

4 files changed

+36
-23
lines changed

llvm/include/llvm/Transforms/Vectorize/VectorCombine.h

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,16 @@
2020
namespace llvm {
2121

2222
/// Optimize scalar/vector interactions in IR using target cost models.
23-
struct VectorCombinePass : public PassInfoMixin<VectorCombinePass> {
23+
class VectorCombinePass : public PassInfoMixin<VectorCombinePass> {
24+
/// If true only perform scalarization combines and do not introduce new
25+
/// vector operations.
26+
bool ScalarizationOnly;
27+
2428
public:
29+
VectorCombinePass(bool ScalarizationOnly = false)
30+
: ScalarizationOnly(ScalarizationOnly) {}
31+
2532
PreservedAnalyses run(Function &F, FunctionAnalysisManager &);
2633
};
27-
2834
}
2935
#endif // LLVM_TRANSFORMS_VECTORIZE_VECTORCOMBINE_H

llvm/lib/Passes/PassBuilder.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -809,7 +809,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
809809
// The matrix extension can introduce large vector operations early, which can
810810
// benefit from running vector-combine early on.
811811
if (EnableMatrix)
812-
FPM.addPass(VectorCombinePass());
812+
FPM.addPass(VectorCombinePass(/*ScalarizationOnly=*/true));
813813

814814
// Eliminate redundancies.
815815
FPM.addPass(MergedLoadStoreMotionPass());

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,10 @@ namespace {
6363
class VectorCombine {
6464
public:
6565
VectorCombine(Function &F, const TargetTransformInfo &TTI,
66-
const DominatorTree &DT, AAResults &AA, AssumptionCache &AC)
67-
: F(F), Builder(F.getContext()), TTI(TTI), DT(DT), AA(AA), AC(AC) {}
66+
const DominatorTree &DT, AAResults &AA, AssumptionCache &AC,
67+
bool ScalarizationOnly)
68+
: F(F), Builder(F.getContext()), TTI(TTI), DT(DT), AA(AA), AC(AC),
69+
ScalarizationOnly(ScalarizationOnly) {}
6870

6971
bool run();
7072

@@ -75,6 +77,11 @@ class VectorCombine {
7577
const DominatorTree &DT;
7678
AAResults &AA;
7779
AssumptionCache &AC;
80+
81+
/// If true only perform scalarization combines and do not introduce new
82+
/// vector operations.
83+
bool ScalarizationOnly;
84+
7885
InstructionWorklist Worklist;
7986

8087
bool vectorizeLoadInsert(Instruction &I);
@@ -1063,11 +1070,13 @@ bool VectorCombine::run() {
10631070
bool MadeChange = false;
10641071
auto FoldInst = [this, &MadeChange](Instruction &I) {
10651072
Builder.SetInsertPoint(&I);
1066-
MadeChange |= vectorizeLoadInsert(I);
1067-
MadeChange |= foldExtractExtract(I);
1068-
MadeChange |= foldBitcastShuf(I);
1073+
if (!ScalarizationOnly) {
1074+
MadeChange |= vectorizeLoadInsert(I);
1075+
MadeChange |= foldExtractExtract(I);
1076+
MadeChange |= foldBitcastShuf(I);
1077+
MadeChange |= foldExtractedCmps(I);
1078+
}
10691079
MadeChange |= scalarizeBinopOrCmp(I);
1070-
MadeChange |= foldExtractedCmps(I);
10711080
MadeChange |= scalarizeLoadExtract(I);
10721081
MadeChange |= foldSingleElementStore(I);
10731082
};
@@ -1129,7 +1138,7 @@ class VectorCombineLegacyPass : public FunctionPass {
11291138
auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
11301139
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
11311140
auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
1132-
VectorCombine Combiner(F, TTI, DT, AA, AC);
1141+
VectorCombine Combiner(F, TTI, DT, AA, AC, false);
11331142
return Combiner.run();
11341143
}
11351144
};
@@ -1153,7 +1162,7 @@ PreservedAnalyses VectorCombinePass::run(Function &F,
11531162
TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
11541163
DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
11551164
AAResults &AA = FAM.getResult<AAManager>(F);
1156-
VectorCombine Combiner(F, TTI, DT, AA, AC);
1165+
VectorCombine Combiner(F, TTI, DT, AA, AC, ScalarizationOnly);
11571166
if (!Combiner.run())
11581167
return PreservedAnalyses::all();
11591168
PreservedAnalyses PA;

llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -308,18 +308,16 @@ declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1
308308

309309
define <4 x float> @reverse_hadd_v4f32(<4 x float> %a, <4 x float> %b) {
310310
; CHECK-LABEL: @reverse_hadd_v4f32(
311-
; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
312-
; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[A]]
313-
; CHECK-NEXT: [[SHIFT1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
314-
; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[SHIFT1]], [[A]]
315-
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> <i32 undef, i32 undef, i32 6, i32 0>
316-
; CHECK-NEXT: [[SHIFT2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
317-
; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
318-
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <4 x i32> <i32 undef, i32 4, i32 2, i32 3>
319-
; CHECK-NEXT: [[SHIFT3:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
320-
; CHECK-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[SHIFT3]], [[B]]
321-
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32> <i32 6, i32 1, i32 2, i32 3>
322-
; CHECK-NEXT: ret <4 x float> [[TMP7]]
311+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32> <i32 2, i32 0>
312+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> <i32 3, i32 1>
313+
; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]]
314+
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
315+
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> undef, <2 x i32> <i32 2, i32 0>
316+
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[B]], <4 x float> undef, <2 x i32> <i32 3, i32 1>
317+
; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x float> [[TMP5]], [[TMP6]]
318+
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
319+
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
320+
; CHECK-NEXT: ret <4 x float> [[TMP9]]
323321
;
324322
%vecext = extractelement <4 x float> %a, i32 0
325323
%vecext1 = extractelement <4 x float> %a, i32 1

0 commit comments

Comments
 (0)