Skip to content

Commit 6fa65f8

Browse files
committed
[Polly][MatMul] Abandon dependence analysis.
The copy statements inserted by the matrix-multiplication optimization introduce new dependencies between the copy statements and other statements. As a result, the DependenceInfo must be recomputed. Not recomputing them caused IslAstInfo to deduce that some loops are parallel but cause race conditions when accessing the packed arrays. As a result, matrix-matrix multiplication currently cannot be parallelized. Also see discussion at https://reviews.llvm.org/D125202
1 parent 3944780 commit 6fa65f8

File tree

6 files changed

+63
-49
lines changed

6 files changed

+63
-49
lines changed

polly/include/polly/DependenceInfo.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,15 @@ struct DependenceAnalysis final : public AnalysisInfoMixin<DependenceAnalysis> {
208208

209209
/// Recompute dependences from schedule and memory accesses.
210210
const Dependences &recomputeDependences(Dependences::AnalysisLevel Level);
211+
212+
/// Invalidate the dependence information and recompute it when needed
213+
/// again.
214+
/// May be required when the underlying Scop was changed in a way that
215+
/// would add new dependencies (e.g. between new statement instances
216+
/// inserted into the SCoP) or intentionally breaks existing ones. It is
217+
/// not required when updating the schedule that conforms to the existing
218+
/// dependencies.
219+
void abandonDependences();
211220
};
212221
Result run(Scop &S, ScopAnalysisManager &SAM,
213222
ScopStandardAnalysisResults &SAR);
@@ -241,6 +250,13 @@ class DependenceInfo final : public ScopPass {
241250
/// Recompute dependences from schedule and memory accesses.
242251
const Dependences &recomputeDependences(Dependences::AnalysisLevel Level);
243252

253+
/// Invalidate the dependence information and recompute it when needed again.
254+
/// May be required when the underlying Scop was changed in a way that would
255+
/// add new dependencies (e.g. between new statement instances inserted into
256+
/// the SCoP) or intentionally breaks existing ones. It is not required when
257+
/// updating the schedule that conforms to the existing dependencies.
258+
void abandonDependences();
259+
244260
/// Compute the dependence information for the SCoP @p S.
245261
bool runOnScop(Scop &S) override;
246262

polly/lib/Analysis/DependenceInfo.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -848,6 +848,11 @@ const Dependences &DependenceAnalysis::Result::recomputeDependences(
848848
return *D[Level];
849849
}
850850

851+
void DependenceAnalysis::Result::abandonDependences() {
852+
for (std::unique_ptr<Dependences> &Deps : D)
853+
Deps.release();
854+
}
855+
851856
DependenceAnalysis::Result
852857
DependenceAnalysis::run(Scop &S, ScopAnalysisManager &SAM,
853858
ScopStandardAnalysisResults &SAR) {
@@ -890,6 +895,11 @@ DependenceInfo::recomputeDependences(Dependences::AnalysisLevel Level) {
890895
return *D[Level];
891896
}
892897

898+
void DependenceInfo::abandonDependences() {
899+
for (std::unique_ptr<Dependences> &Deps : D)
900+
Deps.release();
901+
}
902+
893903
bool DependenceInfo::runOnScop(Scop &ScopVar) {
894904
S = &ScopVar;
895905
return false;

polly/lib/Transform/MatmulOptimizer.cpp

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -491,9 +491,6 @@ createMacroKernel(isl::schedule_node Node,
491491
Node = permuteBandNodeDimensions(Node, DimOutNum - 2, DimOutNum - 1);
492492
Node = permuteBandNodeDimensions(Node, DimOutNum - 3, DimOutNum - 1);
493493

494-
// Mark the outermost loop as parallelizable.
495-
Node = Node.as<isl::schedule_node_band>().member_set_coincident(0, true);
496-
497494
return Node.child(0).child(0);
498495
}
499496

polly/lib/Transform/ScheduleOptimizer.cpp

Lines changed: 30 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,7 @@ struct OptimizerAdditionalInfoTy {
228228
bool PatternOpts;
229229
bool Postopts;
230230
bool Prevect;
231+
bool &DepsChanged;
231232
};
232233

233234
class ScheduleTreeOptimizer final {
@@ -526,6 +527,7 @@ ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *NodeArg,
526527
tryOptimizeMatMulPattern(Node, OAI->TTI, OAI->D);
527528
if (!PatternOptimizedSchedule.is_null()) {
528529
MatMulOpts++;
530+
OAI->DepsChanged = true;
529531
return PatternOptimizedSchedule.release();
530532
}
531533
}
@@ -676,21 +678,21 @@ static void walkScheduleTreeForStatistics(isl::schedule Schedule, int Version) {
676678
&Version);
677679
}
678680

679-
static bool runIslScheduleOptimizer(
681+
static void runIslScheduleOptimizer(
680682
Scop &S,
681683
function_ref<const Dependences &(Dependences::AnalysisLevel)> GetDeps,
682684
TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE,
683-
isl::schedule &LastSchedule) {
685+
isl::schedule &LastSchedule, bool &DepsChanged) {
684686

685687
// Skip SCoPs in case they're already optimised by PPCGCodeGeneration
686688
if (S.isToBeSkipped())
687-
return false;
689+
return;
688690

689691
// Skip empty SCoPs but still allow code generation as it will delete the
690692
// loops present but not needed.
691693
if (S.getSize() == 0) {
692694
S.markAsOptimized();
693-
return false;
695+
return;
694696
}
695697

696698
ScopsProcessed++;
@@ -706,7 +708,7 @@ static bool runIslScheduleOptimizer(
706708
&S, Schedule, GetDeps(Dependences::AL_Statement), ORE);
707709
if (ManuallyTransformed.is_null()) {
708710
LLVM_DEBUG(dbgs() << "Error during manual optimization\n");
709-
return false;
711+
return;
710712
}
711713

712714
if (ManuallyTransformed.get() != Schedule.get()) {
@@ -724,18 +726,18 @@ static bool runIslScheduleOptimizer(
724726
// metadata earlier in ScopDetection.
725727
if (!HasUserTransformation && S.hasDisableHeuristicsHint()) {
726728
LLVM_DEBUG(dbgs() << "Heuristic optimizations disabled by metadata\n");
727-
return false;
729+
return;
728730
}
729731

730732
// Get dependency analysis.
731733
const Dependences &D = GetDeps(Dependences::AL_Statement);
732734
if (D.getSharedIslCtx() != S.getSharedIslCtx()) {
733735
LLVM_DEBUG(dbgs() << "DependenceInfo for another SCoP/isl_ctx\n");
734-
return false;
736+
return;
735737
}
736738
if (!D.hasValidDependences()) {
737739
LLVM_DEBUG(dbgs() << "Dependency information not available\n");
738-
return false;
740+
return;
739741
}
740742

741743
// Apply ISL's algorithm only if not overridden by the user. Note that
@@ -769,7 +771,7 @@ static bool runIslScheduleOptimizer(
769771
isl::union_set Domain = S.getDomains();
770772

771773
if (Domain.is_null())
772-
return false;
774+
return;
773775

774776
isl::union_map Validity = D.getDependences(ValidityKinds);
775777
isl::union_map Proximity = D.getDependences(ProximityKinds);
@@ -847,7 +849,7 @@ static bool runIslScheduleOptimizer(
847849
// In cases the scheduler is not able to optimize the code, we just do not
848850
// touch the schedule.
849851
if (Schedule.is_null())
850-
return false;
852+
return;
851853

852854
if (GreedyFusion) {
853855
isl::union_map Validity = D.getDependences(
@@ -858,10 +860,12 @@ static bool runIslScheduleOptimizer(
858860

859861
// Apply post-rescheduling optimizations (if enabled) and/or prevectorization.
860862
const OptimizerAdditionalInfoTy OAI = {
861-
TTI, const_cast<Dependences *>(&D),
863+
TTI,
864+
const_cast<Dependences *>(&D),
862865
/*PatternOpts=*/!HasUserTransformation && PMBasedOpts,
863866
/*Postopts=*/!HasUserTransformation && EnablePostopts,
864-
/*Prevect=*/PollyVectorizerChoice != VECTORIZER_NONE};
867+
/*Prevect=*/PollyVectorizerChoice != VECTORIZER_NONE,
868+
DepsChanged};
865869
if (OAI.PatternOpts || OAI.Postopts || OAI.Prevect) {
866870
Schedule = ScheduleTreeOptimizer::optimizeSchedule(Schedule, &OAI);
867871
Schedule = hoistExtensionNodes(Schedule);
@@ -872,7 +876,7 @@ static bool runIslScheduleOptimizer(
872876
// Skip profitability check if user transformation(s) have been applied.
873877
if (!HasUserTransformation &&
874878
!ScheduleTreeOptimizer::isProfitableSchedule(S, Schedule))
875-
return false;
879+
return;
876880

877881
auto ScopStats = S.getStatistics();
878882
ScopsOptimized++;
@@ -885,8 +889,6 @@ static bool runIslScheduleOptimizer(
885889

886890
if (OptimizedScops)
887891
errs() << S;
888-
889-
return false;
890892
}
891893

892894
bool IslScheduleOptimizerWrapperPass::runOnScop(Scop &S) {
@@ -904,7 +906,13 @@ bool IslScheduleOptimizerWrapperPass::runOnScop(Scop &S) {
904906
getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
905907
TargetTransformInfo *TTI =
906908
&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
907-
return runIslScheduleOptimizer(S, getDependences, TTI, &ORE, LastSchedule);
909+
910+
bool DepsChanged = false;
911+
runIslScheduleOptimizer(S, getDependences, TTI, &ORE, LastSchedule,
912+
DepsChanged);
913+
if (DepsChanged)
914+
getAnalysis<DependenceInfo>().abandonDependences();
915+
return false;
908916
}
909917

910918
static void runScheduleOptimizerPrinter(raw_ostream &OS,
@@ -971,22 +979,18 @@ runIslScheduleOptimizerUsingNPM(Scop &S, ScopAnalysisManager &SAM,
971979
OptimizationRemarkEmitter ORE(&S.getFunction());
972980
TargetTransformInfo *TTI = &SAR.TTI;
973981
isl::schedule LastSchedule;
974-
bool Modified = runIslScheduleOptimizer(S, GetDeps, TTI, &ORE, LastSchedule);
982+
bool DepsChanged = false;
983+
runIslScheduleOptimizer(S, GetDeps, TTI, &ORE, LastSchedule, DepsChanged);
984+
if (DepsChanged)
985+
Deps.abandonDependences();
986+
975987
if (OS) {
976988
*OS << "Printing analysis 'Polly - Optimize schedule of SCoP' for region: '"
977989
<< S.getName() << "' in function '" << S.getFunction().getName()
978990
<< "':\n";
979991
runScheduleOptimizerPrinter(*OS, LastSchedule);
980992
}
981-
982-
if (!Modified)
983-
return PreservedAnalyses::all();
984-
985-
PreservedAnalyses PA;
986-
PA.preserveSet<AllAnalysesOn<Module>>();
987-
PA.preserveSet<AllAnalysesOn<Function>>();
988-
PA.preserveSet<AllAnalysesOn<Loop>>();
989-
return PA;
993+
return PreservedAnalyses::all();
990994
}
991995

992996
llvm::PreservedAnalyses

polly/test/CodeGen/OpenMP/matmul-parallel.ll

Lines changed: 6 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,10 @@
22
; RUN: opt %loadPolly -polly-parallel -polly-opt-isl -polly-codegen -S < %s | FileCheck --check-prefix=CODEGEN %s
33
; REQUIRES: asserts
44

5-
; Parellization of detected matrix-multiplication. The allocations
6-
; Packed_A and Packed_B must be passed to the outlined function.
7-
; llvm.org/PR43164
8-
;
9-
; #define N 1536
10-
; int foo(float A[N][N],float B[N][N],float C[N][N]) {
11-
; for (int i = 0; i < N; i++) {
12-
; for (int j = 0; j < N; j++) {
13-
; for (int k = 0; k < N; k++)
14-
; C[i][j] = C[i][j] + A[i][k] * B[k][j];
15-
; }
16-
; }
17-
; return 0;
18-
; }
5+
; Parallelization of detected matrix-multiplication.
6+
; Currently, this is not supported. Due to Packed_A/Packed_B not being private
7+
; per-thread the outer loops cannot be parallelized and a
8+
; '#pragma omp parallel for' on an inner loop may impose too much overhead.
199

2010
target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
2111
target triple = "x86_64-pc-windows-msvc19.16.27034"
@@ -65,8 +55,6 @@ for.body8:
6555
}
6656

6757

68-
; AST: #pragma omp parallel for
58+
; AST-NOT: parallel
6959

70-
; CODGEN-LABEL: define internal void @init_array_polly_subfn(i8* %polly.par.userContext)
71-
; CODEGEN: %polly.subfunc.arg.Packed_A = load
72-
; CODEGEN: %polly.subfunc.arg.Packed_B = load
60+
; CODEGEN-NOT: subfunc

polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,7 @@
1515
;
1616
; CHECK-NOT: The matrix multiplication pattern was detected
1717
; PATTERN-MATCHING-OPTS: The matrix multiplication pattern was detected
18-
; PARALLEL-AST: #pragma known-parallel
19-
; PARALLEL-AST: #pragma known-parallel
18+
; PARALLEL-AST-NOT: #pragma known-parallel
2019
; STATS: 1 polly-opt-isl - Number of matrix multiplication patterns detected and optimized
2120
;
2221
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

0 commit comments

Comments
 (0)