-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[OpenMP][CodeGen] Improved codegen for combined loop directives #87278
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
3ba72b0
51a8fcc
b13265d
756bb47
4cd1d04
7cf47e1
4f7deb6
a5059a5
2d55723
3f1c383
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4478,6 +4478,8 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { | |
Params); | ||
break; | ||
} | ||
// For 'target teams loop', collect all captured regions so codegen can | ||
// later decide the best IR to emit given the associated loop-nest. | ||
case OMPD_target_teams_loop: | ||
case OMPD_target_teams_distribute_parallel_for: | ||
case OMPD_target_teams_distribute_parallel_for_simd: { | ||
|
@@ -6135,6 +6137,79 @@ processImplicitMapsWithDefaultMappers(Sema &S, DSAStackTy *Stack, | |
} | ||
} | ||
|
||
namespace { | ||
/// A 'teams loop' with a nested 'loop bind(parallel)' or generic function | ||
/// call in the associated loop-nest cannot be a 'parallel for'. | ||
class TeamsLoopChecker final : public ConstStmtVisitor<TeamsLoopChecker> { | ||
Sema &SemaRef; | ||
|
||
public: | ||
bool teamsLoopCanBeParallelFor() const { return TeamsLoopCanBeParallelFor; } | ||
|
||
// Is there a nested OpenMP loop bind(parallel) | ||
void VisitOMPExecutableDirective(const OMPExecutableDirective *D) { | ||
if (D->getDirectiveKind() == llvm::omp::Directive::OMPD_loop) { | ||
if (const auto *C = D->getSingleClause<OMPBindClause>()) | ||
if (C->getBindKind() == OMPC_BIND_parallel) { | ||
TeamsLoopCanBeParallelFor = false; | ||
// No need to continue visiting any more | ||
return; | ||
} | ||
} | ||
for (const Stmt *Child : D->children()) | ||
if (Child) | ||
Visit(Child); | ||
} | ||
|
||
void VisitCallExpr(const CallExpr *C) { | ||
// Function calls inhibit parallel loop translation of 'target teams loop' | ||
// unless the assume-no-nested-parallelism flag has been specified. | ||
// OpenMP API runtime library calls do not inhibit parallel loop | ||
// translation, regardless of the assume-no-nested-parallelism. | ||
if (C) { | ||
bool IsOpenMPAPI = false; | ||
auto *FD = dyn_cast_or_null<FunctionDecl>(C->getCalleeDecl()); | ||
if (FD) { | ||
std::string Name = FD->getNameInfo().getAsString(); | ||
IsOpenMPAPI = Name.find("omp_") == 0; | ||
} | ||
TeamsLoopCanBeParallelFor = | ||
IsOpenMPAPI || SemaRef.getLangOpts().OpenMPNoNestedParallelism; | ||
if (!TeamsLoopCanBeParallelFor) | ||
return; | ||
} | ||
for (const Stmt *Child : C->children()) | ||
if (Child) | ||
Visit(Child); | ||
} | ||
Comment on lines
+6169
to
+6184
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi @ddpagan, the static verifier complains about this code since you check that 'C' is not a nullptr, but then do not check before the 'C->children() call. It seems 'C' won't be null since it is checked in ActOnOpenMPTargetTeamsGenericLoopDirective. Can we remove the if(C) check and use an assert instead if you think it helps? Or if 'C' really can be null somehow we need to ensure that for loop is excluded for that case. |
||
|
||
void VisitCapturedStmt(const CapturedStmt *S) { | ||
if (!S) | ||
return; | ||
Visit(S->getCapturedDecl()->getBody()); | ||
} | ||
|
||
void VisitStmt(const Stmt *S) { | ||
if (!S) | ||
return; | ||
for (const Stmt *Child : S->children()) | ||
if (Child) | ||
Visit(Child); | ||
} | ||
explicit TeamsLoopChecker(Sema &SemaRef) | ||
: SemaRef(SemaRef), TeamsLoopCanBeParallelFor(true) {} | ||
|
||
private: | ||
bool TeamsLoopCanBeParallelFor; | ||
}; | ||
} // namespace | ||
|
||
/// Scan the statement associated with a 'teams loop' and report whether it
/// may be lowered as a 'parallel for' (i.e. it contains no nested
/// 'loop bind(parallel)' directive and no inhibiting function call).
static bool teamsLoopCanBeParallelFor(Stmt *AStmt, Sema &SemaRef) {
  TeamsLoopChecker LoopNestChecker(SemaRef);
  LoopNestChecker.Visit(AStmt);
  return LoopNestChecker.teamsLoopCanBeParallelFor();
}
|
||
bool Sema::mapLoopConstruct(llvm::SmallVector<OMPClause *> &ClausesWithoutBind, | ||
ArrayRef<OMPClause *> Clauses, | ||
OpenMPBindClauseKind &BindKind, | ||
|
@@ -10895,7 +10970,8 @@ StmtResult Sema::ActOnOpenMPTargetTeamsGenericLoopDirective( | |
setFunctionHasBranchProtectedScope(); | ||
|
||
return OMPTargetTeamsGenericLoopDirective::Create( | ||
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); | ||
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, | ||
teamsLoopCanBeParallelFor(AStmt, *this)); | ||
} | ||
|
||
StmtResult Sema::ActOnOpenMPParallelGenericLoopDirective( | ||
|
@@ -15645,14 +15721,19 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause( | |
if (NameModifier == OMPD_unknown || NameModifier == OMPD_parallel) | ||
CaptureRegion = OMPD_target; | ||
break; | ||
case OMPD_teams_loop: | ||
case OMPD_target_teams_loop: | ||
// For [target] teams loop, assume capture region is 'teams' so it's | ||
// available for codegen later to use if/when necessary. | ||
CaptureRegion = OMPD_teams; | ||
break; | ||
case OMPD_target_teams_distribute_parallel_for_simd: | ||
if (OpenMPVersion >= 50 && | ||
(NameModifier == OMPD_unknown || NameModifier == OMPD_simd)) { | ||
CaptureRegion = OMPD_parallel; | ||
break; | ||
} | ||
[[fallthrough]]; | ||
case OMPD_target_teams_loop: | ||
case OMPD_target_teams_distribute_parallel_for: | ||
// If this clause applies to the nested 'parallel' region, capture within | ||
// the 'teams' region, otherwise do not capture. | ||
|
@@ -15775,7 +15856,6 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause( | |
case OMPD_declare_target: | ||
case OMPD_end_declare_target: | ||
case OMPD_loop: | ||
case OMPD_teams_loop: | ||
case OMPD_teams: | ||
case OMPD_tile: | ||
case OMPD_unroll: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm concerned detecting CallExprs like this is going to have unexpected effects on the generated code: there are a lot of constructs that are written as "calls", but don't actually call anything external. So minor changes to the user's code could have an unexpectedly large impact on code generation.
Is there some way we can do this transform in an LLVM IR optimization pass?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry for the delayed response, Eli. I didn't get notified of your comment.
So, to answer your specific question first, yes this can be done in OpenMPOpt, and eventually it should be. But for now, this is a good compromise built on what's currently implemented.
And you are right that the generated code can be different. If a call is added to an existing target-teams-loop loop nest, it will impact code generation (target-teams-distribute versus target-teams-distribute-parallel-for) and the result will be slower, though correct. If a call is removed, the opposite occurs. The idea is that, since prior to this change target-teams-distribute-parallel-for was always generated (which in some cases can be incorrect), this simplistic approach lets us catch many cases where we can still generate the faster IR.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If you have a plan to migrate the check to an LLVM optimization, this is okay as a temporary solution, I guess. I just don't want to end up in a situation where we try to continually refine this check instead of just moving it where it's supposed to be.
Maybe put a comment in the code describing the plan.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Currently, there are no plans to move this into the optimizer, though it is a step that should be taken at some point. Also, there are no plans to make any more changes to the current implementation after this.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I just meant "plan" more loosely in the sense of how you expect the code to evolve in the future, not that you have a timeline to implement a specific change.