Commit ff6929e

[SYCL] Disable vectorization and loop transformation passes (#2458)
Loop unrolling in "SYCL optimization mode" uses the default heuristic, which is tuned for CPUs and might not be profitable for other devices.
1 parent 07b2796 commit ff6929e
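
For context, here is a minimal sketch of how a front end that builds the legacy pipeline could opt a SYCL device module into this mode. The wiring is an assumption for illustration only: this patch touches only PassManagerBuilder.cpp, and SYCLOptimizationMode is assumed to be a public flag on PassManagerBuilder based on the checks visible in the diff below; the rest is the standard legacy PassManagerBuilder API.

  #include "llvm/IR/LegacyPassManager.h"
  #include "llvm/Transforms/IPO/PassManagerBuilder.h"

  // Hypothetical setup of an -O2 module pipeline for a SYCL device module.
  void buildDeviceModulePipeline(llvm::legacy::PassManager &MPM,
                                 bool IsSYCLDevice) {
    llvm::PassManagerBuilder PMB;
    PMB.OptLevel = 2;
    PMB.SizeLevel = 0;
    // Assumed member; the patch reads it to skip loop transformations and
    // vectorizers for device code.
    PMB.SYCLOptimizationMode = IsSYCLDevice;
    PMB.populateModulePassManager(MPM);
  }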

File tree

3 files changed: +125 −136 lines

clang/lib/Driver/ToolChains/Clang.cpp

Lines changed: 0 additions & 9 deletions
@@ -6634,17 +6634,10 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
                     options::OPT_fno_gnu_inline_asm, true))
     CmdArgs.push_back("-fno-gnu-inline-asm");

-  bool EnableSYCLEarlyOptimizations =
-      Args.hasFlag(options::OPT_fsycl_early_optimizations,
-                   options::OPT_fno_sycl_early_optimizations,
-                   Triple.getSubArch() != llvm::Triple::SPIRSubArch_fpga);
-
   // Enable vectorization per default according to the optimization level
   // selected. For optimization levels that want vectorization we use the alias
   // option to simplify the hasFlag logic.
   bool EnableVec = shouldEnableVectorizerAtOLevel(Args, false);
-  if (RawTriple.isSPIR() && EnableSYCLEarlyOptimizations)
-    EnableVec = false; // But disable vectorization for SYCL device code
   OptSpecifier VectorizeAliasOption =
       EnableVec ? options::OPT_O_Group : options::OPT_fvectorize;
   if (Args.hasFlag(options::OPT_fvectorize, VectorizeAliasOption,
@@ -6653,8 +6646,6 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,

   // -fslp-vectorize is enabled based on the optimization level selected.
   bool EnableSLPVec = shouldEnableVectorizerAtOLevel(Args, true);
-  if (RawTriple.isSPIR() && EnableSYCLEarlyOptimizations)
-    EnableSLPVec = false; // But disable vectorization for SYCL device code
   OptSpecifier SLPVectAliasOption =
       EnableSLPVec ? options::OPT_O_Group : options::OPT_fslp_vectorize;
   if (Args.hasFlag(options::OPT_fslp_vectorize, SLPVectAliasOption,
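
The driver keeps the alias-option trick described in the comment above. As a rough stand-alone model of that hasFlag logic (simplified, hypothetical helper; the real decision is made by Args.hasFlag with OPT_fvectorize and OPT_O_Group):

  #include <optional>

  // If the optimization level already wants vectorization, the -O group acts
  // as the "enabled" alias, so only an explicit -fno-vectorize turns the
  // vectorizer off; otherwise an explicit -fvectorize is needed to turn it on.
  bool emitVectorizeLoops(bool OptLevelWantsVec,
                          std::optional<bool> ExplicitVectorizeFlag) {
    if (ExplicitVectorizeFlag) // -fvectorize / -fno-vectorize was given
      return *ExplicitVectorizeFlag;
    return OptLevelWantsVec;   // otherwise follow the -O group default
  }

With the SYCL-specific overrides removed here, device and host compilations make this decision the same way; the SYCL-specific disabling moves into PassManagerBuilder.cpp below.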

clang/test/Driver/sycl-device-optimizations.cpp

Lines changed: 0 additions & 10 deletions
@@ -36,13 +36,3 @@
 // RUN: | FileCheck -check-prefix=CHECK-DAE %s
 // CHECK-DAE: clang{{.*}} "-fenable-sycl-dae"
 // CHECK-DAE: sycl-post-link{{.*}} "-emit-param-info"
-
-/// Check that vectorizers are disabled by default:
-// RUN: %clang -### -fsycl %s 2>&1 \
-// RUN: | FileCheck -check-prefix=CHECK-VEC-DEFAULT %s
-// CHECK-VEC-DEFAULT-NOT: clang{{.*}} "-fsycl-is-device"{{.*}} "-vectorize-loops"
-// CHECK-VEC-DEFAULT-NOT: clang{{.*}} "-fsycl-is-device"{{.*}} "-vectorize-slp"
-/// Check that vectorizers can still be enabled manually:
-// RUN: %clang -### -fsycl -fvectorize -fslp-vectorize %s 2>&1 \
-// RUN: | FileCheck -check-prefix=CHECK-VEC-ENABLE %s
-// CHECK-VEC-ENABLE: clang{{.*}} "-fsycl-is-device"{{.*}}"-vectorize-loops"{{.*}}"-vectorize-slp"

llvm/lib/Transforms/IPO/PassManagerBuilder.cpp

Lines changed: 125 additions & 117 deletions
@@ -429,54 +429,54 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
   MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
   MPM.add(createReassociatePass());       // Reassociate expressions

-  // Begin the loop pass pipeline.
-  if (EnableSimpleLoopUnswitch) {
-    // The simple loop unswitch pass relies on separate cleanup passes. Schedule
-    // them first so when we re-process a loop they run before other loop
-    // passes.
-    MPM.add(createLoopInstSimplifyPass());
-    MPM.add(createLoopSimplifyCFGPass());
-  }
-  // Try to remove as much code from the loop header as possible,
-  // to reduce amount of IR that will have to be duplicated.
-  // TODO: Investigate promotion cap for O1.
-  MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
-  // Rotate Loop - disable header duplication at -Oz
-  MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1, PrepareForLTO));
-  // TODO: Investigate promotion cap for O1.
-  MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
-  if (EnableSimpleLoopUnswitch)
-    MPM.add(createSimpleLoopUnswitchLegacyPass());
-  else
-    MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
-  // FIXME: We break the loop pass pipeline here in order to do full
-  // simplify-cfg. Eventually loop-simplifycfg should be enhanced to replace the
-  // need for this.
-  MPM.add(createCFGSimplificationPass());
-  MPM.add(createInstructionCombiningPass());
-  // We resume loop passes creating a second loop pipeline here.
-  if (EnableLoopFlatten) {
-    MPM.add(createLoopFlattenPass()); // Flatten loops
-    MPM.add(createLoopSimplifyCFGPass());
+  // Do not run loop pass pipeline in "SYCL Optimization Mode". Loop
+  // optimizations rely on TTI, which is not accurate for SPIR target.
+  if (!SYCLOptimizationMode) {
+    // Begin the loop pass pipeline.
+    if (EnableSimpleLoopUnswitch) {
+      // The simple loop unswitch pass relies on separate cleanup passes.
+      // Schedule them first so when we re-process a loop they run before other
+      // loop passes.
+      MPM.add(createLoopInstSimplifyPass());
+      MPM.add(createLoopSimplifyCFGPass());
+    }
+    // Try to remove as much code from the loop header as possible,
+    // to reduce amount of IR that will have to be duplicated.
+    // TODO: Investigate promotion cap for O1.
+    MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
+    // Rotate Loop - disable header duplication at -Oz
+    MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1, PrepareForLTO));
+    // TODO: Investigate promotion cap for O1.
+    MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
+    if (EnableSimpleLoopUnswitch)
+      MPM.add(createSimpleLoopUnswitchLegacyPass());
+    else
+      MPM.add(
+          createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
+    // FIXME: We break the loop pass pipeline here in order to do full
+    // simplify-cfg. Eventually loop-simplifycfg should be enhanced to replace
+    // the need for this.
+    MPM.add(createCFGSimplificationPass());
+    MPM.add(createInstructionCombiningPass());
+    // We resume loop passes creating a second loop pipeline here.
+    if (EnableLoopFlatten) {
+      MPM.add(createLoopFlattenPass()); // Flatten loops
+      MPM.add(createLoopSimplifyCFGPass());
+    }
+    MPM.add(createLoopIdiomPass());      // Recognize idioms like memset.
+    MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars
+    addExtensionsToPM(EP_LateLoopOptimizations, MPM);
+    MPM.add(createLoopDeletionPass()); // Delete dead loops
+
+    if (EnableLoopInterchange)
+      MPM.add(createLoopInterchangePass()); // Interchange loops
+
+    // Unroll small loops and perform peeling.
+    MPM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops,
+                                       ForgetAllSCEVInLoopUnroll));
+    addExtensionsToPM(EP_LoopOptimizerEnd, MPM);
+    // This ends the loop pass pipelines.
   }
-  MPM.add(createLoopIdiomPass()); // Recognize idioms like memset.
-  // TODO: this pass hurts performance due to promotions of induction variables
-  // from 32-bit value to 64-bit values. I assume it's because SPIR is a virtual
-  // target with unlimited # of registers and pass doesn't take into account
-  // that on real HW this promotion is not beneficial.
-  if (!SYCLOptimizationMode)
-    MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars
-  addExtensionsToPM(EP_LateLoopOptimizations, MPM);
-  MPM.add(createLoopDeletionPass()); // Delete dead loops
-
-  if (EnableLoopInterchange)
-    MPM.add(createLoopInterchangePass()); // Interchange loops
-
-  // Unroll small loops and perform peeling.
-  MPM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops,
-                                     ForgetAllSCEVInLoopUnroll));
-  addExtensionsToPM(EP_LoopOptimizerEnd, MPM);
-  // This ends the loop pass pipelines.

   // Break up allocas that may now be splittable after loop unrolling.
   MPM.add(createSROAPass());
@@ -788,68 +788,74 @@ void PassManagerBuilder::populateModulePassManager(

   addExtensionsToPM(EP_VectorizerStart, MPM);

-  // Re-rotate loops in all our loop nests. These may have fallout out of
-  // rotated form due to GVN or other transformations, and the vectorizer relies
-  // on the rotated form. Disable header duplication at -Oz.
-  MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1, PrepareForLTO));
-
-  // Distribute loops to allow partial vectorization. I.e. isolate dependences
-  // into separate loop that would otherwise inhibit vectorization. This is
-  // currently only performed for loops marked with the metadata
-  // llvm.loop.distribute=true or when -enable-loop-distribute is specified.
-  MPM.add(createLoopDistributePass());
-
-  MPM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize));
-
-  // Eliminate loads by forwarding stores from the previous iteration to loads
-  // of the current iteration.
-  MPM.add(createLoopLoadEliminationPass());
-
-  // FIXME: Because of #pragma vectorize enable, the passes below are always
-  // inserted in the pipeline, even when the vectorizer doesn't run (ex. when
-  // on -O1 and no #pragma is found). Would be good to have these two passes
-  // as function calls, so that we can only pass them when the vectorizer
-  // changed the code.
-  MPM.add(createInstructionCombiningPass());
-  if (OptLevel > 1 && ExtraVectorizerPasses) {
-    // At higher optimization levels, try to clean up any runtime overlap and
-    // alignment checks inserted by the vectorizer. We want to track correllated
-    // runtime checks for two inner loops in the same outer loop, fold any
-    // common computations, hoist loop-invariant aspects out of any outer loop,
-    // and unswitch the runtime checks if possible. Once hoisted, we may have
-    // dead (or speculatable) control flows or more combining opportunities.
-    MPM.add(createEarlyCSEPass());
-    MPM.add(createCorrelatedValuePropagationPass());
-    MPM.add(createInstructionCombiningPass());
-    MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
-    MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
-    MPM.add(createCFGSimplificationPass());
+  if (!SYCLOptimizationMode) {
+    // Re-rotate loops in all our loop nests. These may have fallout out of
+    // rotated form due to GVN or other transformations, and the vectorizer
+    // relies on the rotated form. Disable header duplication at -Oz.
+    MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1, PrepareForLTO));
+
+    // Distribute loops to allow partial vectorization. I.e. isolate
+    // dependences into separate loop that would otherwise inhibit
+    // vectorization. This is currently only performed for loops marked with
+    // the metadata llvm.loop.distribute=true or when -enable-loop-distribute is
+    // specified.
+    MPM.add(createLoopDistributePass());
+
+    MPM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize));
+
+    // Eliminate loads by forwarding stores from the previous iteration to loads
+    // of the current iteration.
+    MPM.add(createLoopLoadEliminationPass());
+
+    // FIXME: Because of #pragma vectorize enable, the passes below are always
+    // inserted in the pipeline, even when the vectorizer doesn't run (ex. when
+    // on -O1 and no #pragma is found). Would be good to have these two passes
+    // as function calls, so that we can only pass them when the vectorizer
+    // changed the code.
     MPM.add(createInstructionCombiningPass());
-  }
-
-  // Cleanup after loop vectorization, etc. Simplification passes like CVP and
-  // GVN, loop transforms, and others have already run, so it's now better to
-  // convert to more optimized IR using more aggressive simplify CFG options.
-  // The extra sinking transform can create larger basic blocks, so do this
-  // before SLP vectorization.
-  // FIXME: study whether hoisting and/or sinking of common instructions should
-  // be delayed until after SLP vectorizer.
-  MPM.add(createCFGSimplificationPass(SimplifyCFGOptions()
-                                          .forwardSwitchCondToPhi(true)
-                                          .convertSwitchToLookupTable(true)
-                                          .needCanonicalLoops(false)
-                                          .hoistCommonInsts(true)
-                                          .sinkCommonInsts(true)));
-
-  if (SLPVectorize) {
-    MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
     if (OptLevel > 1 && ExtraVectorizerPasses) {
+      // At higher optimization levels, try to clean up any runtime overlap and
+      // alignment checks inserted by the vectorizer. We want to track
+      // correllated runtime checks for two inner loops in the same outer loop,
+      // fold any common computations, hoist loop-invariant aspects out of any
+      // outer loop, and unswitch the runtime checks if possible. Once hoisted,
+      // we may have dead (or speculatable) control flows or more combining
+      // opportunities.
       MPM.add(createEarlyCSEPass());
+      MPM.add(createCorrelatedValuePropagationPass());
+      MPM.add(createInstructionCombiningPass());
+      MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
+      MPM.add(
+          createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
+      MPM.add(createCFGSimplificationPass());
+      MPM.add(createInstructionCombiningPass());
     }
-  }

-  // Enhance/cleanup vector code.
-  MPM.add(createVectorCombinePass());
+    // Cleanup after loop vectorization, etc. Simplification passes like CVP and
+    // GVN, loop transforms, and others have already run, so it's now better to
+    // convert to more optimized IR using more aggressive simplify CFG options.
+    // The extra sinking transform can create larger basic blocks, so do this
+    // before SLP vectorization.
+    // FIXME: study whether hoisting and/or sinking of common instructions
+    // should
+    // be delayed until after SLP vectorizer.
+    MPM.add(createCFGSimplificationPass(SimplifyCFGOptions()
+                                            .forwardSwitchCondToPhi(true)
+                                            .convertSwitchToLookupTable(true)
+                                            .needCanonicalLoops(false)
+                                            .hoistCommonInsts(true)
+                                            .sinkCommonInsts(true)));
+
+    if (SLPVectorize) {
+      MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
+      if (OptLevel > 1 && ExtraVectorizerPasses) {
+        MPM.add(createEarlyCSEPass());
+      }
+    }
+
+    // Enhance/cleanup vector code.
+    MPM.add(createVectorCombinePass());
+  }

   addExtensionsToPM(EP_Peephole, MPM);
   MPM.add(createInstructionCombiningPass());
@@ -861,22 +867,24 @@ void PassManagerBuilder::populateModulePassManager(
     MPM.add(createLoopUnrollAndJamPass(OptLevel));
   }

-  // Unroll small loops
-  MPM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops,
-                               ForgetAllSCEVInLoopUnroll));
+  if (!SYCLOptimizationMode) {
+    // Unroll small loops
+    MPM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops,
+                                 ForgetAllSCEVInLoopUnroll));

-  if (!DisableUnrollLoops) {
-    // LoopUnroll may generate some redundency to cleanup.
-    MPM.add(createInstructionCombiningPass());
+    if (!DisableUnrollLoops) {
+      // LoopUnroll may generate some redundency to cleanup.
+      MPM.add(createInstructionCombiningPass());

-    // Runtime unrolling will introduce runtime check in loop prologue. If the
-    // unrolled loop is a inner loop, then the prologue will be inside the
-    // outer loop. LICM pass can help to promote the runtime check out if the
-    // checked value is loop invariant.
-    MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
-  }
+      // Runtime unrolling will introduce runtime check in loop prologue. If the
+      // unrolled loop is a inner loop, then the prologue will be inside the
+      // outer loop. LICM pass can help to promote the runtime check out if the
+      // checked value is loop invariant.
+      MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
+    }

-  MPM.add(createWarnMissedTransformationsPass());
+    MPM.add(createWarnMissedTransformationsPass());
+  }

   // After vectorization and unrolling, assume intrinsics may tell us more
   // about pointer alignments.
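
To make the effect concrete, consider an illustrative (hypothetical) device-side loop. In SYCL optimization mode the guards above skip createSimpleLoopUnrollPass, createLoopUnrollPass, createLoopVectorizePass and createSLPVectorizerPass, so a loop like this is left rolled and unvectorized in the IR handed to the SPIR-V tool chain, the idea being that the downstream device compiler can apply its own heuristics instead of the CPU-tuned defaults.

  // Illustrative only: a simple reduction as it might appear in SYCL device code.
  float dot(const float *A, const float *B, int N) {
    float Acc = 0.0f;
    for (int I = 0; I < N; ++I)
      Acc += A[I] * B[I];
    return Acc;
  }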
