Skip to content

Commit 375d213

Browse files
authored
[SYCL][NewPM] Disable vectorization and loop transformation (#5305)
Port the changes that disable the vectorization and loop transformation passes in the optimization pipeline from the legacy Pass Manager builder to the new Pass Manager builder (see commit ff6929e). Also, for the tests that fail without this fix after optimization by the new Pass Manager, add validation for both the legacy and the new Pass Managers. Signed-off-by: Mikhail Lychkov <[email protected]>
1 parent 8250a51 commit 375d213

File tree

5 files changed

+220
-178
lines changed

5 files changed

+220
-178
lines changed

llvm/lib/Passes/PassBuilderPipelines.cpp

Lines changed: 194 additions & 168 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,8 @@ extern cl::opt<bool> EnableMatrix;
223223

224224
extern cl::opt<bool> DisablePreInliner;
225225
extern cl::opt<int> PreInlineThreshold;
226+
227+
extern cl::opt<bool> SYCLOptimizationMode;
226228
} // namespace llvm
227229

228230
void PassBuilder::invokePeepholeEPCallbacks(FunctionPassManager &FPM,
@@ -271,78 +273,88 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level,
271273
// Form canonically associated expression trees, and simplify the trees using
272274
// basic mathematical properties. For example, this will form (nearly)
273275
// minimal multiplication trees.
274-
FPM.addPass(ReassociatePass());
275-
276-
// Add the primary loop simplification pipeline.
277-
// FIXME: Currently this is split into two loop pass pipelines because we run
278-
// some function passes in between them. These can and should be removed
279-
// and/or replaced by scheduling the loop pass equivalents in the correct
280-
// positions. But those equivalent passes aren't powerful enough yet.
281-
// Specifically, `SimplifyCFGPass` and `InstCombinePass` are currently still
282-
// used. We have `LoopSimplifyCFGPass` which isn't yet powerful enough yet to
283-
// fully replace `SimplifyCFGPass`, and the closest to the other we have is
284-
// `LoopInstSimplify`.
285-
LoopPassManager LPM1, LPM2;
286-
287-
// Simplify the loop body. We do this initially to clean up after other loop
288-
// passes run, either when iterating on a loop or on inner loops with
289-
// implications on the outer loop.
290-
LPM1.addPass(LoopInstSimplifyPass());
291-
LPM1.addPass(LoopSimplifyCFGPass());
292-
293-
// Try to remove as much code from the loop header as possible,
294-
// to reduce amount of IR that will have to be duplicated.
295-
// TODO: Investigate promotion cap for O1.
296-
LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
297-
298-
LPM1.addPass(LoopRotatePass(/* Disable header duplication */ true,
299-
isLTOPreLink(Phase)));
300-
// TODO: Investigate promotion cap for O1.
301-
LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
302-
LPM1.addPass(SimpleLoopUnswitchPass());
303-
if (EnableLoopFlatten)
304-
LPM1.addPass(LoopFlattenPass());
305-
306-
LPM2.addPass(LoopIdiomRecognizePass());
307-
LPM2.addPass(IndVarSimplifyPass());
308-
309-
for (auto &C : LateLoopOptimizationsEPCallbacks)
310-
C(LPM2, Level);
311-
312-
LPM2.addPass(LoopDeletionPass());
313-
314-
if (EnableLoopInterchange)
315-
LPM2.addPass(LoopInterchangePass());
316-
317-
// Do not enable unrolling in PreLinkThinLTO phase during sample PGO
318-
// because it changes IR to makes profile annotation in back compile
319-
// inaccurate. The normal unroller doesn't pay attention to forced full unroll
320-
// attributes so we need to make sure and allow the full unroll pass to pay
321-
// attention to it.
322-
if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt ||
323-
PGOOpt->Action != PGOOptions::SampleUse)
324-
LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(),
325-
/* OnlyWhenForced= */ !PTO.LoopUnrolling,
326-
PTO.ForgetAllSCEVInLoopUnroll));
327-
328-
for (auto &C : LoopOptimizerEndEPCallbacks)
329-
C(LPM2, Level);
330-
331-
// We provide the opt remark emitter pass for LICM to use. We only need to do
332-
// this once as it is immutable.
333-
FPM.addPass(
334-
RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
335-
FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1),
336-
/*UseMemorySSA=*/true,
337-
/*UseBlockFrequencyInfo=*/true));
338-
FPM.addPass(SimplifyCFGPass());
339-
FPM.addPass(InstCombinePass());
340-
// The loop passes in LPM2 (LoopFullUnrollPass) do not preserve MemorySSA.
341-
// *All* loop passes must preserve it, in order to be able to use it.
342-
FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2),
343-
/*UseMemorySSA=*/false,
344-
/*UseBlockFrequencyInfo=*/false));
276+
if (!SYCLOptimizationMode) {
277+
// FIXME: re-association increases variable liveness and therefore register
278+
// pressure.
279+
FPM.addPass(ReassociatePass());
280+
281+
// Do not run loop pass pipeline in "SYCL Optimization Mode". Loop
282+
// optimizations rely on TTI, which is not accurate for SPIR target.
283+
284+
// Add the primary loop simplification pipeline.
285+
// FIXME: Currently this is split into two loop pass pipelines because we
286+
// run some function passes in between them. These can and should be removed
287+
// and/or replaced by scheduling the loop pass equivalents in the correct
288+
// positions. But those equivalent passes aren't powerful enough yet.
289+
// Specifically, `SimplifyCFGPass` and `InstCombinePass` are currently still
290+
// used. We have `LoopSimplifyCFGPass`, which isn't powerful enough yet
291+
// to fully replace `SimplifyCFGPass`, and the closest to the other we have
292+
// is `LoopInstSimplify`.
293+
LoopPassManager LPM1, LPM2;
294+
295+
// Simplify the loop body. We do this initially to clean up after other loop
296+
// passes run, either when iterating on a loop or on inner loops with
297+
// implications on the outer loop.
298+
LPM1.addPass(LoopInstSimplifyPass());
299+
LPM1.addPass(LoopSimplifyCFGPass());
300+
301+
// Try to remove as much code from the loop header as possible,
302+
// to reduce amount of IR that will have to be duplicated.
303+
// TODO: Investigate promotion cap for O1.
304+
LPM1.addPass(
305+
LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
306+
307+
LPM1.addPass(LoopRotatePass(/* Disable header duplication */ true,
308+
isLTOPreLink(Phase)));
309+
// TODO: Investigate promotion cap for O1.
310+
LPM1.addPass(
311+
LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
312+
LPM1.addPass(SimpleLoopUnswitchPass());
313+
if (EnableLoopFlatten)
314+
LPM1.addPass(LoopFlattenPass());
315+
316+
LPM2.addPass(LoopIdiomRecognizePass());
317+
LPM2.addPass(IndVarSimplifyPass());
318+
319+
for (auto &C : LateLoopOptimizationsEPCallbacks)
320+
C(LPM2, Level);
345321

322+
LPM2.addPass(LoopDeletionPass());
323+
324+
if (EnableLoopInterchange)
325+
LPM2.addPass(LoopInterchangePass());
326+
327+
// Do not enable unrolling in PreLinkThinLTO phase during sample PGO
328+
// because it changes the IR, which makes profile annotation in back compile
329+
// inaccurate. The normal unroller doesn't pay attention to forced full
330+
// unroll attributes so we need to make sure and allow the full unroll pass
331+
// to pay attention to it.
332+
if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt ||
333+
PGOOpt->Action != PGOOptions::SampleUse)
334+
LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(),
335+
/* OnlyWhenForced= */ !PTO.LoopUnrolling,
336+
PTO.ForgetAllSCEVInLoopUnroll));
337+
338+
for (auto &C : LoopOptimizerEndEPCallbacks)
339+
C(LPM2, Level);
340+
341+
// We provide the opt remark emitter pass for LICM to use. We only need to
342+
// do this once as it is immutable.
343+
FPM.addPass(
344+
RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
345+
FPM.addPass(
346+
createFunctionToLoopPassAdaptor(std::move(LPM1),
347+
/*UseMemorySSA=*/true,
348+
/*UseBlockFrequencyInfo=*/true));
349+
FPM.addPass(SimplifyCFGPass());
350+
FPM.addPass(InstCombinePass());
351+
// The loop passes in LPM2 (LoopFullUnrollPass) do not preserve MemorySSA.
352+
// *All* loop passes must preserve it, in order to be able to use it.
353+
FPM.addPass(
354+
createFunctionToLoopPassAdaptor(std::move(LPM2),
355+
/*UseMemorySSA=*/false,
356+
/*UseBlockFrequencyInfo=*/false));
357+
}
346358
// Delete small array after loop unroll.
347359
FPM.addPass(SROAPass());
348360

@@ -443,81 +455,92 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
443455
// Form canonically associated expression trees, and simplify the trees using
444456
// basic mathematical properties. For example, this will form (nearly)
445457
// minimal multiplication trees.
446-
FPM.addPass(ReassociatePass());
447-
448-
// Add the primary loop simplification pipeline.
449-
// FIXME: Currently this is split into two loop pass pipelines because we run
450-
// some function passes in between them. These can and should be removed
451-
// and/or replaced by scheduling the loop pass equivalents in the correct
452-
// positions. But those equivalent passes aren't powerful enough yet.
453-
// Specifically, `SimplifyCFGPass` and `InstCombinePass` are currently still
454-
// used. We have `LoopSimplifyCFGPass` which isn't yet powerful enough yet to
455-
// fully replace `SimplifyCFGPass`, and the closest to the other we have is
456-
// `LoopInstSimplify`.
457-
LoopPassManager LPM1, LPM2;
458-
459-
// Simplify the loop body. We do this initially to clean up after other loop
460-
// passes run, either when iterating on a loop or on inner loops with
461-
// implications on the outer loop.
462-
LPM1.addPass(LoopInstSimplifyPass());
463-
LPM1.addPass(LoopSimplifyCFGPass());
464-
465-
// Try to remove as much code from the loop header as possible,
466-
// to reduce amount of IR that will have to be duplicated.
467-
// TODO: Investigate promotion cap for O1.
468-
LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
458+
if (!SYCLOptimizationMode) {
459+
// FIXME: re-association increases variable liveness and therefore register
460+
// pressure.
461+
FPM.addPass(ReassociatePass());
462+
463+
// Do not run loop pass pipeline in "SYCL Optimization Mode". Loop
464+
// optimizations rely on TTI, which is not accurate for SPIR target.
465+
466+
// Add the primary loop simplification pipeline.
467+
// FIXME: Currently this is split into two loop pass pipelines because we
468+
// run some function passes in between them. These can and should be removed
469+
// and/or replaced by scheduling the loop pass equivalents in the correct
470+
// positions. But those equivalent passes aren't powerful enough yet.
471+
// Specifically, `SimplifyCFGPass` and `InstCombinePass` are currently still
472+
// used. We have `LoopSimplifyCFGPass`, which isn't powerful enough yet
473+
// to fully replace `SimplifyCFGPass`, and the closest to the other we have
474+
// is `LoopInstSimplify`.
475+
LoopPassManager LPM1, LPM2;
476+
477+
// Simplify the loop body. We do this initially to clean up after other loop
478+
// passes run, either when iterating on a loop or on inner loops with
479+
// implications on the outer loop.
480+
LPM1.addPass(LoopInstSimplifyPass());
481+
LPM1.addPass(LoopSimplifyCFGPass());
482+
483+
// Try to remove as much code from the loop header as possible,
484+
// to reduce amount of IR that will have to be duplicated.
485+
// TODO: Investigate promotion cap for O1.
486+
LPM1.addPass(
487+
LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
488+
489+
// Disable header duplication in loop rotation at -Oz.
490+
LPM1.addPass(
491+
LoopRotatePass(Level != OptimizationLevel::Oz, isLTOPreLink(Phase)));
492+
// TODO: Investigate promotion cap for O1.
493+
LPM1.addPass(
494+
LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
495+
LPM1.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level ==
496+
OptimizationLevel::O3 &&
497+
EnableO3NonTrivialUnswitching));
498+
if (EnableLoopFlatten)
499+
LPM1.addPass(LoopFlattenPass());
500+
501+
LPM2.addPass(LoopIdiomRecognizePass());
502+
LPM2.addPass(IndVarSimplifyPass());
469503

470-
// Disable header duplication in loop rotation at -Oz.
471-
LPM1.addPass(
472-
LoopRotatePass(Level != OptimizationLevel::Oz, isLTOPreLink(Phase)));
473-
// TODO: Investigate promotion cap for O1.
474-
LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
475-
LPM1.addPass(
476-
SimpleLoopUnswitchPass(/* NonTrivial */ Level == OptimizationLevel::O3 &&
477-
EnableO3NonTrivialUnswitching));
478-
if (EnableLoopFlatten)
479-
LPM1.addPass(LoopFlattenPass());
480-
481-
LPM2.addPass(LoopIdiomRecognizePass());
482-
LPM2.addPass(IndVarSimplifyPass());
483-
484-
for (auto &C : LateLoopOptimizationsEPCallbacks)
485-
C(LPM2, Level);
486-
487-
LPM2.addPass(LoopDeletionPass());
488-
489-
if (EnableLoopInterchange)
490-
LPM2.addPass(LoopInterchangePass());
491-
492-
// Do not enable unrolling in PreLinkThinLTO phase during sample PGO
493-
// because it changes IR to makes profile annotation in back compile
494-
// inaccurate. The normal unroller doesn't pay attention to forced full unroll
495-
// attributes so we need to make sure and allow the full unroll pass to pay
496-
// attention to it.
497-
if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt ||
498-
PGOOpt->Action != PGOOptions::SampleUse)
499-
LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(),
500-
/* OnlyWhenForced= */ !PTO.LoopUnrolling,
501-
PTO.ForgetAllSCEVInLoopUnroll));
502-
503-
for (auto &C : LoopOptimizerEndEPCallbacks)
504-
C(LPM2, Level);
505-
506-
// We provide the opt remark emitter pass for LICM to use. We only need to do
507-
// this once as it is immutable.
508-
FPM.addPass(
509-
RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
510-
FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1),
511-
/*UseMemorySSA=*/true,
512-
/*UseBlockFrequencyInfo=*/true));
513-
FPM.addPass(SimplifyCFGPass());
514-
FPM.addPass(InstCombinePass());
515-
// The loop passes in LPM2 (LoopIdiomRecognizePass, IndVarSimplifyPass,
516-
// LoopDeletionPass and LoopFullUnrollPass) do not preserve MemorySSA.
517-
// *All* loop passes must preserve it, in order to be able to use it.
518-
FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2),
519-
/*UseMemorySSA=*/false,
520-
/*UseBlockFrequencyInfo=*/false));
504+
for (auto &C : LateLoopOptimizationsEPCallbacks)
505+
C(LPM2, Level);
506+
507+
LPM2.addPass(LoopDeletionPass());
508+
509+
if (EnableLoopInterchange)
510+
LPM2.addPass(LoopInterchangePass());
511+
512+
// Do not enable unrolling in PreLinkThinLTO phase during sample PGO
513+
// because it changes the IR, which makes profile annotation in back compile
514+
// inaccurate. The normal unroller doesn't pay attention to forced full
515+
// unroll attributes so we need to make sure and allow the full unroll pass
516+
// to pay attention to it.
517+
if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt ||
518+
PGOOpt->Action != PGOOptions::SampleUse)
519+
LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(),
520+
/* OnlyWhenForced= */ !PTO.LoopUnrolling,
521+
PTO.ForgetAllSCEVInLoopUnroll));
522+
523+
for (auto &C : LoopOptimizerEndEPCallbacks)
524+
C(LPM2, Level);
525+
526+
// We provide the opt remark emitter pass for LICM to use. We only need to
527+
// do this once as it is immutable.
528+
FPM.addPass(
529+
RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
530+
FPM.addPass(
531+
createFunctionToLoopPassAdaptor(std::move(LPM1),
532+
/*UseMemorySSA=*/true,
533+
/*UseBlockFrequencyInfo=*/true));
534+
FPM.addPass(SimplifyCFGPass());
535+
FPM.addPass(InstCombinePass());
536+
// The loop passes in LPM2 (LoopIdiomRecognizePass, IndVarSimplifyPass,
537+
// LoopDeletionPass and LoopFullUnrollPass) do not preserve MemorySSA.
538+
// *All* loop passes must preserve it, in order to be able to use it.
539+
FPM.addPass(
540+
createFunctionToLoopPassAdaptor(std::move(LPM2),
541+
/*UseMemorySSA=*/false,
542+
/*UseBlockFrequencyInfo=*/false));
543+
}
521544

522545
// Delete small array after loop unroll.
523546
FPM.addPass(SROAPass());
@@ -1162,29 +1185,32 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
11621185
for (auto &C : VectorizerStartEPCallbacks)
11631186
C(OptimizePM, Level);
11641187

1165-
LoopPassManager LPM;
1166-
// First rotate loops that may have been un-rotated by prior passes.
1167-
// Disable header duplication at -Oz.
1168-
LPM.addPass(LoopRotatePass(Level != OptimizationLevel::Oz, LTOPreLink));
1169-
// Some loops may have become dead by now. Try to delete them.
1170-
// FIXME: see discussion in https://reviews.llvm.org/D112851,
1171-
// this may need to be revisited once we run GVN before loop deletion
1172-
// in the simplification pipeline.
1173-
LPM.addPass(LoopDeletionPass());
1174-
OptimizePM.addPass(createFunctionToLoopPassAdaptor(
1175-
std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/false));
1176-
1177-
// Distribute loops to allow partial vectorization. I.e. isolate dependences
1178-
// into separate loop that would otherwise inhibit vectorization. This is
1179-
// currently only performed for loops marked with the metadata
1180-
// llvm.loop.distribute=true or when -enable-loop-distribute is specified.
1181-
OptimizePM.addPass(LoopDistributePass());
1182-
1183-
// Populates the VFABI attribute with the scalar-to-vector mappings
1184-
// from the TargetLibraryInfo.
1185-
OptimizePM.addPass(InjectTLIMappings());
1186-
1187-
addVectorPasses(Level, OptimizePM, /* IsFullLTO */ false);
1188+
if (!SYCLOptimizationMode) {
1189+
LoopPassManager LPM;
1190+
// First rotate loops that may have been un-rotated by prior passes.
1191+
// Disable header duplication at -Oz.
1192+
LPM.addPass(LoopRotatePass(Level != OptimizationLevel::Oz, LTOPreLink));
1193+
// Some loops may have become dead by now. Try to delete them.
1194+
// FIXME: see discussion in https://reviews.llvm.org/D112851,
1195+
// this may need to be revisited once we run GVN before loop deletion
1196+
// in the simplification pipeline.
1197+
LPM.addPass(LoopDeletionPass());
1198+
OptimizePM.addPass(
1199+
createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/false,
1200+
/*UseBlockFrequencyInfo=*/false));
1201+
1202+
// Distribute loops to allow partial vectorization. I.e. isolate dependences
1203+
// into separate loop that would otherwise inhibit vectorization. This is
1204+
// currently only performed for loops marked with the metadata
1205+
// llvm.loop.distribute=true or when -enable-loop-distribute is specified.
1206+
OptimizePM.addPass(LoopDistributePass());
1207+
1208+
// Populates the VFABI attribute with the scalar-to-vector mappings
1209+
// from the TargetLibraryInfo.
1210+
OptimizePM.addPass(InjectTLIMappings());
1211+
1212+
addVectorPasses(Level, OptimizePM, /* IsFullLTO */ false);
1213+
}
11881214

11891215
// LoopSink pass sinks instructions hoisted by LICM, which serves as a
11901216
// canonicalization pass that enables other optimizations. As a result,

0 commit comments

Comments
 (0)