Commit 8710c48

[OpenMP][CodeGen] Improved codegen for combined loop directives
IR for 'target teams loop' is now dependent on the suitability of the associated
loop-nest.

If a loop-nest:
  - does not contain a function call, or
  - the -fopenmp-assume-no-nested-parallelism flag has been specified, or
  - the only calls are to OpenMP API routines,
  AND
  - does not contain nested 'loop bind(parallel)' directives,

then it can be emitted as 'target teams distribute parallel for', which is the
current default. Otherwise, it is emitted as 'target teams distribute'.

Added debug output indicating how 'target teams loop' was emitted. The flag is
-mllvm -debug-only=target-teams-loop-codegen.

Added LIT tests explicitly verifying 'target teams loop' emitted as a parallel
loop and as a distribute loop.

Updated other 'loop'-related tests as needed to reflect the change in IR.
1 parent ac378ac commit 8710c48

21 files changed: +7,098 −5,911 lines
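As a rough illustration of the conditions above (a hypothetical example, not one
of the added LIT tests): the first loop-nest below has no calls and no nested
'loop bind(parallel)', so it can still be emitted as 'target teams distribute
parallel for'; the second contains an opaque function call, so it is expected to
be emitted as 'target teams distribute' unless
-fopenmp-assume-no-nested-parallelism is given.

    extern void update(float *y, int i); // opaque call: may itself contain parallelism

    void axpy(const float *x, float *y, float a, int n) {
      // No calls, no nested 'loop bind(parallel)': parallel-for form.
      #pragma omp target teams loop
      for (int i = 0; i < n; ++i)
        y[i] += a * x[i];

      // Opaque call in the loop body: distribute form (unless the
      // assume-no-nested-parallelism flag is specified).
      #pragma omp target teams loop
      for (int i = 0; i < n; ++i)
        update(y, i);
    }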

clang/lib/CodeGen/CGOpenMPRuntime.cpp

Lines changed: 6 additions & 3 deletions
@@ -2644,7 +2644,8 @@ void CGOpenMPRuntime::emitForStaticFinish(CodeGenFunction &CGF,
   // Call __kmpc_for_static_fini(ident_t *loc, kmp_int32 tid);
   llvm::Value *Args[] = {
       emitUpdateLocation(CGF, Loc,
-                         isOpenMPDistributeDirective(DKind)
+                         isOpenMPDistributeDirective(DKind) ||
+                                 (DKind == OMPD_target_teams_loop)
                              ? OMP_IDENT_WORK_DISTRIBUTE
                              : isOpenMPLoopDirective(DKind)
                                    ? OMP_IDENT_WORK_LOOP
@@ -8779,7 +8780,8 @@ getNestedDistributeDirective(ASTContext &Ctx, const OMPExecutableDirective &D) {
     OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
     switch (D.getDirectiveKind()) {
     case OMPD_target:
-      // For now, just treat 'target teams loop' as if it's distributed.
+      // For now, treat 'target' with nested 'teams loop' as if it's
+      // distributed (target teams distribute).
       if (isOpenMPDistributeDirective(DKind) || DKind == OMPD_teams_loop)
         return NestedDir;
       if (DKind == OMPD_teams) {
@@ -9263,7 +9265,8 @@ llvm::Value *CGOpenMPRuntime::emitTargetNumIterationsCall(
         SizeEmitter) {
   OpenMPDirectiveKind Kind = D.getDirectiveKind();
   const OMPExecutableDirective *TD = &D;
-  // Get nested teams distribute kind directive, if any.
+  // Get nested teams distribute kind directive, if any. For now, treat
+  // 'target_teams_loop' as if it's really a target_teams_distribute.
   if ((!isOpenMPDistributeDirective(Kind) || !isOpenMPTeamsDirective(Kind)) &&
       Kind != OMPD_target_teams_loop)
     TD = getNestedDistributeDirective(CGM.getContext(), D);

clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp

Lines changed: 7 additions & 3 deletions
@@ -639,14 +639,14 @@ static bool hasNestedSPMDDirective(ASTContext &Ctx,
     return false;
   }
 
-static bool supportsSPMDExecutionMode(ASTContext &Ctx,
+static bool supportsSPMDExecutionMode(CodeGenModule &CGM,
                                       const OMPExecutableDirective &D) {
+  ASTContext &Ctx = CGM.getContext();
   OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
   switch (DirectiveKind) {
   case OMPD_target:
   case OMPD_target_teams:
     return hasNestedSPMDDirective(Ctx, D);
-  case OMPD_target_teams_loop:
   case OMPD_target_parallel_loop:
   case OMPD_target_parallel:
   case OMPD_target_parallel_for:
@@ -658,6 +658,10 @@ static bool supportsSPMDExecutionMode(ASTContext &Ctx,
     return true;
   case OMPD_target_teams_distribute:
     return false;
+  case OMPD_target_teams_loop:
+    // Whether this is true or not depends on how the directive will
+    // eventually be emitted.
+    return CGM.teamsLoopCanBeParallelFor(D);
   case OMPD_parallel:
   case OMPD_for:
   case OMPD_parallel_for:
@@ -870,7 +874,7 @@ void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
 
   assert(!ParentName.empty() && "Invalid target region parent name!");
 
-  bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
+  bool Mode = supportsSPMDExecutionMode(CGM, D);
   bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
   if (Mode || IsBareKernel)
     emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
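Because the SPMD decision for 'target teams loop' now follows
teamsLoopCanBeParallelFor, a loop-nest whose only calls are OpenMP API routines
(callee names starting with "omp_") keeps the parallel-for form and can still be
compiled as an SPMD kernel on the device. A minimal, hypothetical sketch:

    #include <omp.h>

    void fill_ids(int *ids, int n) {
      // The only call is to an OpenMP API routine, so the directive can still
      // be emitted as 'target teams distribute parallel for' (SPMD mode on GPUs).
      #pragma omp target teams loop map(from: ids[0:n])
      for (int i = 0; i < n; ++i)
        ids[i] = omp_get_thread_num();
    }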

clang/lib/CodeGen/CGStmtOpenMP.cpp

Lines changed: 51 additions & 14 deletions
@@ -34,11 +34,14 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Debug.h"
 #include <optional>
 using namespace clang;
 using namespace CodeGen;
 using namespace llvm::omp;
 
+#define TTL_CODEGEN_TYPE "target-teams-loop-codegen"
+
 static const VarDecl *getBaseDecl(const Expr *Ref);
 
 namespace {
@@ -1435,6 +1438,7 @@ void CodeGenFunction::EmitOMPReductionClauseFinal(
   }
   bool WithNowait = D.getSingleClause<OMPNowaitClause>() ||
                     isOpenMPParallelDirective(D.getDirectiveKind()) ||
+                    CGM.teamsLoopCanBeParallelFor(D) ||
                     ReductionKind == OMPD_simd;
   bool SimpleReduction = ReductionKind == OMPD_simd;
   // Emit nowait reduction if nowait clause is present or directive is a
@@ -7876,11 +7880,9 @@ void CodeGenFunction::EmitOMPParallelGenericLoopDirective(
 void CodeGenFunction::EmitOMPTeamsGenericLoopDirective(
     const OMPTeamsGenericLoopDirective &S) {
   // To be consistent with current behavior of 'target teams loop', emit
-  // 'teams loop' as if its constituent constructs are 'distribute,
-  // 'parallel, and 'for'.
+  // 'teams loop' as if its constituent constructs are 'teams' and 'distribute'.
   auto &&CodeGenDistribute = [&S](CodeGenFunction &CGF, PrePostActionTy &) {
-    CGF.EmitOMPDistributeLoop(S, emitInnerParallelForWhenCombined,
-                              S.getDistInc());
+    CGF.EmitOMPDistributeLoop(S, emitOMPLoopBodyWithStopPoint, S.getInc());
   };
 
   // Emit teams region as a standalone region.
@@ -7894,15 +7896,14 @@ void CodeGenFunction::EmitOMPTeamsGenericLoopDirective(
                                                     CodeGenDistribute);
     CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_teams);
   };
-  emitCommonOMPTeamsDirective(*this, S, OMPD_distribute_parallel_for, CodeGen);
+  emitCommonOMPTeamsDirective(*this, S, OMPD_distribute, CodeGen);
   emitPostUpdateForReductionClause(*this, S,
                                    [](CodeGenFunction &) { return nullptr; });
 }
 
-static void
-emitTargetTeamsGenericLoopRegion(CodeGenFunction &CGF,
-                                 const OMPTargetTeamsGenericLoopDirective &S,
-                                 PrePostActionTy &Action) {
+static void emitTargetTeamsGenericLoopRegionAsParallel(
+    CodeGenFunction &CGF, PrePostActionTy &Action,
+    const OMPTargetTeamsGenericLoopDirective &S) {
   Action.Enter(CGF);
   // Emit 'teams loop' as if its constituent constructs are 'distribute,
   // 'parallel, and 'for'.
@@ -7922,19 +7923,52 @@ emitTargetTeamsGenericLoopRegion(CodeGenFunction &CGF,
         CGF, OMPD_distribute, CodeGenDistribute, /*HasCancel=*/false);
     CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_teams);
   };
-
+  DEBUG_WITH_TYPE(TTL_CODEGEN_TYPE,
+                  CGF.CGM.emitTargetTeamsLoopCodegenStatus(
+                      TTL_CODEGEN_TYPE " as parallel for", S,
+                      CGF.CGM.getLangOpts().OpenMPIsTargetDevice));
   emitCommonOMPTeamsDirective(CGF, S, OMPD_distribute_parallel_for,
                               CodeGenTeams);
   emitPostUpdateForReductionClause(CGF, S,
                                    [](CodeGenFunction &) { return nullptr; });
 }
 
-/// Emit combined directive 'target teams loop' as if its constituent
-/// constructs are 'target', 'teams', 'distribute', 'parallel', and 'for'.
+static void emitTargetTeamsGenericLoopRegionAsDistribute(
+    CodeGenFunction &CGF, PrePostActionTy &Action,
+    const OMPTargetTeamsGenericLoopDirective &S) {
+  Action.Enter(CGF);
+  // Emit 'teams loop' as if its constituent construct is 'distribute'.
+  auto &&CodeGenDistribute = [&S](CodeGenFunction &CGF, PrePostActionTy &) {
+    CGF.EmitOMPDistributeLoop(S, emitOMPLoopBodyWithStopPoint, S.getInc());
+  };
+
+  // Emit teams region as a standalone region.
+  auto &&CodeGen = [&S, &CodeGenDistribute](CodeGenFunction &CGF,
+                                            PrePostActionTy &Action) {
+    Action.Enter(CGF);
+    CodeGenFunction::OMPPrivateScope PrivateScope(CGF);
+    CGF.EmitOMPReductionClauseInit(S, PrivateScope);
+    (void)PrivateScope.Privatize();
+    CGF.CGM.getOpenMPRuntime().emitInlinedDirective(
+        CGF, OMPD_distribute, CodeGenDistribute, /*HasCancel=*/false);
+    CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_teams);
+  };
+  DEBUG_WITH_TYPE(TTL_CODEGEN_TYPE,
+                  CGF.CGM.emitTargetTeamsLoopCodegenStatus(
+                      TTL_CODEGEN_TYPE " as distribute", S,
+                      CGF.CGM.getLangOpts().OpenMPIsTargetDevice));
+  emitCommonOMPTeamsDirective(CGF, S, OMPD_distribute, CodeGen);
+  emitPostUpdateForReductionClause(CGF, S,
+                                   [](CodeGenFunction &) { return nullptr; });
+}
+
 void CodeGenFunction::EmitOMPTargetTeamsGenericLoopDirective(
     const OMPTargetTeamsGenericLoopDirective &S) {
   auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
-    emitTargetTeamsGenericLoopRegion(CGF, S, Action);
+    if (CGF.CGM.teamsLoopCanBeParallelFor(S))
+      emitTargetTeamsGenericLoopRegionAsParallel(CGF, Action, S);
+    else
+      emitTargetTeamsGenericLoopRegionAsDistribute(CGF, Action, S);
   };
   emitCommonOMPTargetDirective(*this, S, CodeGen);
 }
@@ -7944,7 +7978,10 @@ void CodeGenFunction::EmitOMPTargetTeamsGenericLoopDeviceFunction(
     const OMPTargetTeamsGenericLoopDirective &S) {
   // Emit SPMD target parallel loop region as a standalone region.
   auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
-    emitTargetTeamsGenericLoopRegion(CGF, S, Action);
+    if (CGF.CGM.teamsLoopCanBeParallelFor(S))
+      emitTargetTeamsGenericLoopRegionAsParallel(CGF, Action, S);
+    else
+      emitTargetTeamsGenericLoopRegionAsDistribute(CGF, Action, S);
   };
   llvm::Function *Fn;
   llvm::Constant *Addr;
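The DEBUG_WITH_TYPE calls above back the -mllvm -debug-only=target-teams-loop-codegen
flag mentioned in the commit message; they take effect only in an assertions-enabled
(debug) build of clang. Based on emitTargetTeamsLoopCodegenStatus (added in
CodeGenModule.cpp below), the output should look roughly like the following; the
file name, line numbers, and offload target here are purely illustrative:

    $ clang -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa \
        -mllvm -debug-only=target-teams-loop-codegen -c axpy.c
    target-teams-loop-codegen as parallel for: HOST: axpy.c: 5
    target-teams-loop-codegen as distribute: HOST: axpy.c: 11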

clang/lib/CodeGen/CodeGenModule.cpp

Lines changed: 93 additions & 0 deletions
@@ -7485,6 +7485,99 @@ void CodeGenModule::printPostfixForExternalizedDecl(llvm::raw_ostream &OS,
   }
 }
 
+namespace {
+/// A 'teams loop' with a nested 'loop bind(parallel)' or generic function
+/// call in the associated loop-nest cannot be a 'parllel for'.
+class TeamsLoopChecker final : public ConstStmtVisitor<TeamsLoopChecker> {
+public:
+  TeamsLoopChecker(CodeGenModule &CGM)
+      : CGM(CGM), TeamsLoopCanBeParallelFor{true} {}
+  bool teamsLoopCanBeParallelFor() const {
+    return TeamsLoopCanBeParallelFor;
+  }
+  // Is there a nested OpenMP loop bind(parallel)
+  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
+    if (D->getDirectiveKind() == llvm::omp::Directive::OMPD_loop) {
+      if (const auto *C = D->getSingleClause<OMPBindClause>())
+        if (C->getBindKind() == OMPC_BIND_parallel) {
+          TeamsLoopCanBeParallelFor = false;
+          // No need to continue visiting any more
+          return;
+        }
+    }
+    for (const Stmt *Child : D->children())
+      if (Child)
+        Visit(Child);
+  }
+
+  void VisitCallExpr(const CallExpr *C) {
+    // Function calls inhibit parallel loop translation of 'target teams loop'
+    // unless the assume-no-nested-parallelism flag has been specified.
+    // OpenMP API runtime library calls do not inhibit parallel loop
+    // translation, regardless of the assume-no-nested-parallelism.
+    if (C) {
+      bool IsOpenMPAPI = false;
+      auto *FD = dyn_cast_or_null<FunctionDecl>(C->getCalleeDecl());
+      if (FD) {
+        std::string Name = FD->getNameInfo().getAsString();
+        IsOpenMPAPI = Name.find("omp_") == 0;
+      }
+      TeamsLoopCanBeParallelFor =
+          IsOpenMPAPI || CGM.getLangOpts().OpenMPNoNestedParallelism;
+      if (!TeamsLoopCanBeParallelFor)
+        return;
+    }
+    for (const Stmt *Child : C->children())
+      if (Child)
+        Visit(Child);
+  }
+
+  void VisitCapturedStmt(const CapturedStmt *S) {
+    if (!S)
+      return;
+    Visit(S->getCapturedDecl()->getBody());
+  }
+
+  void VisitStmt(const Stmt *S) {
+    if (!S)
+      return;
+    for (const Stmt *Child : S->children())
+      if (Child)
+        Visit(Child);
+  }
+
+private:
+  CodeGenModule &CGM;
+  bool TeamsLoopCanBeParallelFor;
+};
+} // namespace
+
+/// Determine if 'teams loop' can be emitted using 'parallel for'.
+bool CodeGenModule::teamsLoopCanBeParallelFor(const OMPExecutableDirective &D) {
+  if (D.getDirectiveKind() != llvm::omp::Directive::OMPD_target_teams_loop)
+    return false;
+  assert(D.hasAssociatedStmt() &&
+         "Loop directive must have associated statement.");
+  TeamsLoopChecker Checker(*this);
+  Checker.Visit(D.getAssociatedStmt());
+  return Checker.teamsLoopCanBeParallelFor();
+}
+
+void CodeGenModule::emitTargetTeamsLoopCodegenStatus(
+    std::string StatusMsg, const OMPExecutableDirective &D, bool IsDevice) {
+  if (IsDevice)
+    StatusMsg += ": DEVICE";
+  else
+    StatusMsg += ": HOST";
+  SourceLocation L = D.getBeginLoc();
+  SourceManager &SM = getContext().getSourceManager();
+  PresumedLoc PLoc = SM.getPresumedLoc(L);
+  const char *FileName = PLoc.isValid() ? PLoc.getFilename() : nullptr;
+  unsigned LineNo =
+      PLoc.isValid() ? PLoc.getLine() : SM.getExpansionLineNumber(L);
+  llvm::dbgs() << StatusMsg << ": " << FileName << ": " << LineNo << "\n";
+}
+
 void CodeGenModule::moveLazyEmissionStates(CodeGenModule *NewBuilder) {
   assert(DeferredDeclsToEmit.empty() &&
          "Should have emitted all decls deferred to emit.");

clang/lib/CodeGen/CodeGenModule.h

Lines changed: 8 additions & 0 deletions
@@ -1528,6 +1528,8 @@ class CodeGenModule : public CodeGenTypeCache {
                           LValueBaseInfo *BaseInfo = nullptr,
                           TBAAAccessInfo *TBAAInfo = nullptr);
   bool stopAutoInit();
+  /// Determine if 'teams loop' can be emitted using 'parallel for'.
+  bool teamsLoopCanBeParallelFor(const OMPExecutableDirective &D);
 
   /// Print the postfix for externalized static variable or kernels for single
   /// source offloading languages CUDA and HIP. The unique postfix is created
@@ -1537,6 +1539,12 @@ class CodeGenModule : public CodeGenTypeCache {
   void printPostfixForExternalizedDecl(llvm::raw_ostream &OS,
                                        const Decl *D) const;
 
+  /// Under debug mode, print status of target teams loop transformation,
+  /// which should be either '#distribute' or '#parallel for'
+  void emitTargetTeamsLoopCodegenStatus(std::string StatusMsg,
+                                        const OMPExecutableDirective &D,
+                                        bool IsDevice);
+
   /// Move some lazily-emitted states to the NewBuilder. This is especially
   /// essential for the incremental parsing environment like Clang Interpreter,
   /// because we'll lose all important information after each repl.

clang/lib/Sema/SemaOpenMP.cpp

Lines changed: 8 additions & 2 deletions
@@ -4478,6 +4478,8 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) {
                              Params);
     break;
   }
+  // For 'target teams loop', collect all captured regions so codegen can
+  // later decide the best IR to emit given the associated loop-nest.
   case OMPD_target_teams_loop:
   case OMPD_target_teams_distribute_parallel_for:
   case OMPD_target_teams_distribute_parallel_for_simd: {
@@ -15573,14 +15575,19 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause(
     if (NameModifier == OMPD_unknown || NameModifier == OMPD_parallel)
       CaptureRegion = OMPD_target;
     break;
+  case OMPD_teams_loop:
+  case OMPD_target_teams_loop:
+    // For [target] teams loop, assume capture region is 'teams' so it's
+    // available for codegen later to use if/when necessary.
+    CaptureRegion = OMPD_teams;
+    break;
   case OMPD_target_teams_distribute_parallel_for_simd:
     if (OpenMPVersion >= 50 &&
         (NameModifier == OMPD_unknown || NameModifier == OMPD_simd)) {
       CaptureRegion = OMPD_parallel;
       break;
     }
     [[fallthrough]];
-  case OMPD_target_teams_loop:
   case OMPD_target_teams_distribute_parallel_for:
     // If this clause applies to the nested 'parallel' region, capture within
     // the 'teams' region, otherwise do not capture.
@@ -15703,7 +15710,6 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause(
   case OMPD_declare_target:
   case OMPD_end_declare_target:
   case OMPD_loop:
-  case OMPD_teams_loop:
   case OMPD_teams:
   case OMPD_tile:
   case OMPD_unroll:
