Skip to content

Commit 22c95ef

Browse files
committed
[SYCL][NewPM] Disable vectorization and loop transformation
Port the changes that disable vectorization and loop transformation passes in the optimization pipeline from the legacy Pass Manager builder to the new PM builder (see commit ff6929e). Also, for the tests that fail without this fix when optimized by the new Pass Manager, add validation for both the legacy and new PMs. Signed-off-by: Mikhail Lychkov <[email protected]>
1 parent c18f43c commit 22c95ef

File tree

5 files changed

+218
-177
lines changed

5 files changed

+218
-177
lines changed

llvm/lib/Passes/PassBuilderPipelines.cpp

Lines changed: 193 additions & 167 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,8 @@ extern cl::opt<bool> EnableMatrix;
223223

224224
extern cl::opt<bool> DisablePreInliner;
225225
extern cl::opt<int> PreInlineThreshold;
226+
227+
extern cl::opt<bool> SYCLOptimizationMode;
226228
} // namespace llvm
227229

228230
void PassBuilder::invokePeepholeEPCallbacks(FunctionPassManager &FPM,
@@ -271,78 +273,88 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level,
271273
// Form canonically associated expression trees, and simplify the trees using
272274
// basic mathematical properties. For example, this will form (nearly)
273275
// minimal multiplication trees.
274-
FPM.addPass(ReassociatePass());
275-
276-
// Add the primary loop simplification pipeline.
277-
// FIXME: Currently this is split into two loop pass pipelines because we run
278-
// some function passes in between them. These can and should be removed
279-
// and/or replaced by scheduling the loop pass equivalents in the correct
280-
// positions. But those equivalent passes aren't powerful enough yet.
281-
// Specifically, `SimplifyCFGPass` and `InstCombinePass` are currently still
282-
// used. We have `LoopSimplifyCFGPass` which isn't yet powerful enough yet to
283-
// fully replace `SimplifyCFGPass`, and the closest to the other we have is
284-
// `LoopInstSimplify`.
285-
LoopPassManager LPM1, LPM2;
286-
287-
// Simplify the loop body. We do this initially to clean up after other loop
288-
// passes run, either when iterating on a loop or on inner loops with
289-
// implications on the outer loop.
290-
LPM1.addPass(LoopInstSimplifyPass());
291-
LPM1.addPass(LoopSimplifyCFGPass());
292-
293-
// Try to remove as much code from the loop header as possible,
294-
// to reduce amount of IR that will have to be duplicated.
295-
// TODO: Investigate promotion cap for O1.
296-
LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
297-
298-
LPM1.addPass(LoopRotatePass(/* Disable header duplication */ true,
299-
isLTOPreLink(Phase)));
300-
// TODO: Investigate promotion cap for O1.
301-
LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
302-
LPM1.addPass(SimpleLoopUnswitchPass());
303-
304-
LPM2.addPass(LoopIdiomRecognizePass());
305-
LPM2.addPass(IndVarSimplifyPass());
306-
307-
for (auto &C : LateLoopOptimizationsEPCallbacks)
308-
C(LPM2, Level);
309-
310-
LPM2.addPass(LoopDeletionPass());
311-
312-
if (EnableLoopInterchange)
313-
LPM2.addPass(LoopInterchangePass());
314-
315-
// Do not enable unrolling in PreLinkThinLTO phase during sample PGO
316-
// because it changes IR to makes profile annotation in back compile
317-
// inaccurate. The normal unroller doesn't pay attention to forced full unroll
318-
// attributes so we need to make sure and allow the full unroll pass to pay
319-
// attention to it.
320-
if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt ||
321-
PGOOpt->Action != PGOOptions::SampleUse)
322-
LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(),
323-
/* OnlyWhenForced= */ !PTO.LoopUnrolling,
324-
PTO.ForgetAllSCEVInLoopUnroll));
325-
326-
for (auto &C : LoopOptimizerEndEPCallbacks)
327-
C(LPM2, Level);
328-
329-
// We provide the opt remark emitter pass for LICM to use. We only need to do
330-
// this once as it is immutable.
331-
FPM.addPass(
332-
RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
333-
FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1),
334-
/*UseMemorySSA=*/true,
335-
/*UseBlockFrequencyInfo=*/true));
336-
FPM.addPass(SimplifyCFGPass());
337-
FPM.addPass(InstCombinePass());
338-
if (EnableLoopFlatten)
339-
FPM.addPass(createFunctionToLoopPassAdaptor(LoopFlattenPass()));
340-
// The loop passes in LPM2 (LoopFullUnrollPass) do not preserve MemorySSA.
341-
// *All* loop passes must preserve it, in order to be able to use it.
342-
FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2),
343-
/*UseMemorySSA=*/false,
344-
/*UseBlockFrequencyInfo=*/false));
276+
if (!SYCLOptimizationMode) {
277+
// FIXME: re-association increases variable liveness and therefore register
278+
// pressure.
279+
FPM.addPass(ReassociatePass());
280+
281+
// Do not run loop pass pipeline in "SYCL Optimization Mode". Loop
282+
// optimizations rely on TTI, which is not accurate for SPIR target.
283+
284+
// Add the primary loop simplification pipeline.
285+
// FIXME: Currently this is split into two loop pass pipelines because we
286+
// run some function passes in between them. These can and should be removed
287+
// and/or replaced by scheduling the loop pass equivalents in the correct
288+
// positions. But those equivalent passes aren't powerful enough yet.
289+
// Specifically, `SimplifyCFGPass` and `InstCombinePass` are currently still
290+
// used. We have `LoopSimplifyCFGPass` which isn't powerful enough yet
291+
// to fully replace `SimplifyCFGPass`, and the closest to the other we have
292+
// is `LoopInstSimplify`.
293+
LoopPassManager LPM1, LPM2;
294+
295+
// Simplify the loop body. We do this initially to clean up after other loop
296+
// passes run, either when iterating on a loop or on inner loops with
297+
// implications on the outer loop.
298+
LPM1.addPass(LoopInstSimplifyPass());
299+
LPM1.addPass(LoopSimplifyCFGPass());
300+
301+
// Try to remove as much code from the loop header as possible,
302+
// to reduce amount of IR that will have to be duplicated.
303+
// TODO: Investigate promotion cap for O1.
304+
LPM1.addPass(
305+
LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
306+
307+
LPM1.addPass(LoopRotatePass(/* Disable header duplication */ true,
308+
isLTOPreLink(Phase)));
309+
// TODO: Investigate promotion cap for O1.
310+
LPM1.addPass(
311+
LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
312+
LPM1.addPass(SimpleLoopUnswitchPass());
313+
314+
LPM2.addPass(LoopIdiomRecognizePass());
315+
LPM2.addPass(IndVarSimplifyPass());
316+
317+
for (auto &C : LateLoopOptimizationsEPCallbacks)
318+
C(LPM2, Level);
345319

320+
LPM2.addPass(LoopDeletionPass());
321+
322+
if (EnableLoopInterchange)
323+
LPM2.addPass(LoopInterchangePass());
324+
325+
// Do not enable unrolling in PreLinkThinLTO phase during sample PGO
326+
// because it changes IR and makes profile annotation in back compile
327+
// inaccurate. The normal unroller doesn't pay attention to forced full
328+
// unroll attributes so we need to make sure and allow the full unroll pass
329+
// to pay attention to it.
330+
if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt ||
331+
PGOOpt->Action != PGOOptions::SampleUse)
332+
LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(),
333+
/* OnlyWhenForced= */ !PTO.LoopUnrolling,
334+
PTO.ForgetAllSCEVInLoopUnroll));
335+
336+
for (auto &C : LoopOptimizerEndEPCallbacks)
337+
C(LPM2, Level);
338+
339+
// We provide the opt remark emitter pass for LICM to use. We only need to
340+
// do this once as it is immutable.
341+
FPM.addPass(
342+
RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
343+
FPM.addPass(
344+
createFunctionToLoopPassAdaptor(std::move(LPM1),
345+
/*UseMemorySSA=*/true,
346+
/*UseBlockFrequencyInfo=*/true));
347+
FPM.addPass(SimplifyCFGPass());
348+
FPM.addPass(InstCombinePass());
349+
if (EnableLoopFlatten)
350+
FPM.addPass(createFunctionToLoopPassAdaptor(LoopFlattenPass()));
351+
// The loop passes in LPM2 (LoopFullUnrollPass) do not preserve MemorySSA.
352+
// *All* loop passes must preserve it, in order to be able to use it.
353+
FPM.addPass(
354+
createFunctionToLoopPassAdaptor(std::move(LPM2),
355+
/*UseMemorySSA=*/false,
356+
/*UseBlockFrequencyInfo=*/false));
357+
}
346358
// Delete small array after loop unroll.
347359
FPM.addPass(SROAPass());
348360

@@ -443,80 +455,91 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
443455
// Form canonically associated expression trees, and simplify the trees using
444456
// basic mathematical properties. For example, this will form (nearly)
445457
// minimal multiplication trees.
446-
FPM.addPass(ReassociatePass());
447-
448-
// Add the primary loop simplification pipeline.
449-
// FIXME: Currently this is split into two loop pass pipelines because we run
450-
// some function passes in between them. These can and should be removed
451-
// and/or replaced by scheduling the loop pass equivalents in the correct
452-
// positions. But those equivalent passes aren't powerful enough yet.
453-
// Specifically, `SimplifyCFGPass` and `InstCombinePass` are currently still
454-
// used. We have `LoopSimplifyCFGPass` which isn't yet powerful enough yet to
455-
// fully replace `SimplifyCFGPass`, and the closest to the other we have is
456-
// `LoopInstSimplify`.
457-
LoopPassManager LPM1, LPM2;
458-
459-
// Simplify the loop body. We do this initially to clean up after other loop
460-
// passes run, either when iterating on a loop or on inner loops with
461-
// implications on the outer loop.
462-
LPM1.addPass(LoopInstSimplifyPass());
463-
LPM1.addPass(LoopSimplifyCFGPass());
464-
465-
// Try to remove as much code from the loop header as possible,
466-
// to reduce amount of IR that will have to be duplicated.
467-
// TODO: Investigate promotion cap for O1.
468-
LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
458+
if (!SYCLOptimizationMode) {
459+
// FIXME: re-association increases variable liveness and therefore register
460+
// pressure.
461+
FPM.addPass(ReassociatePass());
462+
463+
// Do not run loop pass pipeline in "SYCL Optimization Mode". Loop
464+
// optimizations rely on TTI, which is not accurate for SPIR target.
465+
466+
// Add the primary loop simplification pipeline.
467+
// FIXME: Currently this is split into two loop pass pipelines because we
468+
// run some function passes in between them. These can and should be removed
469+
// and/or replaced by scheduling the loop pass equivalents in the correct
470+
// positions. But those equivalent passes aren't powerful enough yet.
471+
// Specifically, `SimplifyCFGPass` and `InstCombinePass` are currently still
472+
// used. We have `LoopSimplifyCFGPass` which isn't powerful enough yet
473+
// to fully replace `SimplifyCFGPass`, and the closest to the other we have
474+
// is `LoopInstSimplify`.
475+
LoopPassManager LPM1, LPM2;
476+
477+
// Simplify the loop body. We do this initially to clean up after other loop
478+
// passes run, either when iterating on a loop or on inner loops with
479+
// implications on the outer loop.
480+
LPM1.addPass(LoopInstSimplifyPass());
481+
LPM1.addPass(LoopSimplifyCFGPass());
482+
483+
// Try to remove as much code from the loop header as possible,
484+
// to reduce amount of IR that will have to be duplicated.
485+
// TODO: Investigate promotion cap for O1.
486+
LPM1.addPass(
487+
LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
488+
489+
// Disable header duplication in loop rotation at -Oz.
490+
LPM1.addPass(
491+
LoopRotatePass(Level != OptimizationLevel::Oz, isLTOPreLink(Phase)));
492+
// TODO: Investigate promotion cap for O1.
493+
LPM1.addPass(
494+
LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
495+
LPM1.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level ==
496+
OptimizationLevel::O3 &&
497+
EnableO3NonTrivialUnswitching));
498+
LPM2.addPass(LoopIdiomRecognizePass());
499+
LPM2.addPass(IndVarSimplifyPass());
469500

470-
// Disable header duplication in loop rotation at -Oz.
471-
LPM1.addPass(
472-
LoopRotatePass(Level != OptimizationLevel::Oz, isLTOPreLink(Phase)));
473-
// TODO: Investigate promotion cap for O1.
474-
LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
475-
LPM1.addPass(
476-
SimpleLoopUnswitchPass(/* NonTrivial */ Level == OptimizationLevel::O3 &&
477-
EnableO3NonTrivialUnswitching));
478-
LPM2.addPass(LoopIdiomRecognizePass());
479-
LPM2.addPass(IndVarSimplifyPass());
480-
481-
for (auto &C : LateLoopOptimizationsEPCallbacks)
482-
C(LPM2, Level);
483-
484-
LPM2.addPass(LoopDeletionPass());
485-
486-
if (EnableLoopInterchange)
487-
LPM2.addPass(LoopInterchangePass());
488-
489-
// Do not enable unrolling in PreLinkThinLTO phase during sample PGO
490-
// because it changes IR to makes profile annotation in back compile
491-
// inaccurate. The normal unroller doesn't pay attention to forced full unroll
492-
// attributes so we need to make sure and allow the full unroll pass to pay
493-
// attention to it.
494-
if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt ||
495-
PGOOpt->Action != PGOOptions::SampleUse)
496-
LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(),
497-
/* OnlyWhenForced= */ !PTO.LoopUnrolling,
498-
PTO.ForgetAllSCEVInLoopUnroll));
499-
500-
for (auto &C : LoopOptimizerEndEPCallbacks)
501-
C(LPM2, Level);
502-
503-
// We provide the opt remark emitter pass for LICM to use. We only need to do
504-
// this once as it is immutable.
505-
FPM.addPass(
506-
RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
507-
FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1),
508-
/*UseMemorySSA=*/true,
509-
/*UseBlockFrequencyInfo=*/true));
510-
FPM.addPass(SimplifyCFGPass());
511-
FPM.addPass(InstCombinePass());
512-
if (EnableLoopFlatten)
513-
FPM.addPass(createFunctionToLoopPassAdaptor(LoopFlattenPass()));
514-
// The loop passes in LPM2 (LoopIdiomRecognizePass, IndVarSimplifyPass,
515-
// LoopDeletionPass and LoopFullUnrollPass) do not preserve MemorySSA.
516-
// *All* loop passes must preserve it, in order to be able to use it.
517-
FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2),
518-
/*UseMemorySSA=*/false,
519-
/*UseBlockFrequencyInfo=*/false));
501+
for (auto &C : LateLoopOptimizationsEPCallbacks)
502+
C(LPM2, Level);
503+
504+
LPM2.addPass(LoopDeletionPass());
505+
506+
if (EnableLoopInterchange)
507+
LPM2.addPass(LoopInterchangePass());
508+
509+
// Do not enable unrolling in PreLinkThinLTO phase during sample PGO
510+
// because it changes IR and makes profile annotation in back compile
511+
// inaccurate. The normal unroller doesn't pay attention to forced full
512+
// unroll attributes so we need to make sure and allow the full unroll pass
513+
// to pay attention to it.
514+
if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt ||
515+
PGOOpt->Action != PGOOptions::SampleUse)
516+
LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(),
517+
/* OnlyWhenForced= */ !PTO.LoopUnrolling,
518+
PTO.ForgetAllSCEVInLoopUnroll));
519+
520+
for (auto &C : LoopOptimizerEndEPCallbacks)
521+
C(LPM2, Level);
522+
523+
// We provide the opt remark emitter pass for LICM to use. We only need to
524+
// do this once as it is immutable.
525+
FPM.addPass(
526+
RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
527+
FPM.addPass(
528+
createFunctionToLoopPassAdaptor(std::move(LPM1),
529+
/*UseMemorySSA=*/true,
530+
/*UseBlockFrequencyInfo=*/true));
531+
FPM.addPass(SimplifyCFGPass());
532+
FPM.addPass(InstCombinePass());
533+
if (EnableLoopFlatten)
534+
FPM.addPass(createFunctionToLoopPassAdaptor(LoopFlattenPass()));
535+
// The loop passes in LPM2 (LoopIdiomRecognizePass, IndVarSimplifyPass,
536+
// LoopDeletionPass and LoopFullUnrollPass) do not preserve MemorySSA.
537+
// *All* loop passes must preserve it, in order to be able to use it.
538+
FPM.addPass(
539+
createFunctionToLoopPassAdaptor(std::move(LPM2),
540+
/*UseMemorySSA=*/false,
541+
/*UseBlockFrequencyInfo=*/false));
542+
}
520543

521544
// Delete small array after loop unroll.
522545
FPM.addPass(SROAPass());
@@ -1161,29 +1184,32 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
11611184
for (auto &C : VectorizerStartEPCallbacks)
11621185
C(OptimizePM, Level);
11631186

1164-
LoopPassManager LPM;
1165-
// First rotate loops that may have been un-rotated by prior passes.
1166-
// Disable header duplication at -Oz.
1167-
LPM.addPass(LoopRotatePass(Level != OptimizationLevel::Oz, LTOPreLink));
1168-
// Some loops may have become dead by now. Try to delete them.
1169-
// FIXME: see discussion in https://reviews.llvm.org/D112851,
1170-
// this may need to be revisited once we run GVN before loop deletion
1171-
// in the simplification pipeline.
1172-
LPM.addPass(LoopDeletionPass());
1173-
OptimizePM.addPass(createFunctionToLoopPassAdaptor(
1174-
std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/false));
1175-
1176-
// Distribute loops to allow partial vectorization. I.e. isolate dependences
1177-
// into separate loop that would otherwise inhibit vectorization. This is
1178-
// currently only performed for loops marked with the metadata
1179-
// llvm.loop.distribute=true or when -enable-loop-distribute is specified.
1180-
OptimizePM.addPass(LoopDistributePass());
1181-
1182-
// Populates the VFABI attribute with the scalar-to-vector mappings
1183-
// from the TargetLibraryInfo.
1184-
OptimizePM.addPass(InjectTLIMappings());
1185-
1186-
addVectorPasses(Level, OptimizePM, /* IsFullLTO */ false);
1187+
if (!SYCLOptimizationMode) {
1188+
LoopPassManager LPM;
1189+
// First rotate loops that may have been un-rotated by prior passes.
1190+
// Disable header duplication at -Oz.
1191+
LPM.addPass(LoopRotatePass(Level != OptimizationLevel::Oz, LTOPreLink));
1192+
// Some loops may have become dead by now. Try to delete them.
1193+
// FIXME: see discussion in https://reviews.llvm.org/D112851,
1194+
// this may need to be revisited once we run GVN before loop deletion
1195+
// in the simplification pipeline.
1196+
LPM.addPass(LoopDeletionPass());
1197+
OptimizePM.addPass(
1198+
createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/false,
1199+
/*UseBlockFrequencyInfo=*/false));
1200+
1201+
// Distribute loops to allow partial vectorization. I.e. isolate dependences
1202+
// into separate loop that would otherwise inhibit vectorization. This is
1203+
// currently only performed for loops marked with the metadata
1204+
// llvm.loop.distribute=true or when -enable-loop-distribute is specified.
1205+
OptimizePM.addPass(LoopDistributePass());
1206+
1207+
// Populates the VFABI attribute with the scalar-to-vector mappings
1208+
// from the TargetLibraryInfo.
1209+
OptimizePM.addPass(InjectTLIMappings());
1210+
1211+
addVectorPasses(Level, OptimizePM, /* IsFullLTO */ false);
1212+
}
11871213

11881214
// LoopSink pass sinks instructions hoisted by LICM, which serves as a
11891215
// canonicalization pass that enables other optimizations. As a result,

0 commit comments

Comments
 (0)