Skip to content

Commit 9bfe91d

Browse files
committed
[OPENMP][NVPTX]Reduce memory usage in orphaned functions.
If a function has globalized variables and is called in the context of target/teams/distribute regions, it does not need to globalize 32 copies of the same variables for memory coalescing; a single copy is enough, because there is no active parallel region. The patch does this by adding a call to the `__kmpc_parallel_level` function and checking its return value. If the code sees that the parallel level is 0, then only one variable is allocated, not 32. llvm-svn: 344356
1 parent c046b68 commit 9bfe91d

File tree

3 files changed

+83
-12
lines changed

3 files changed

+83
-12
lines changed

clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp

Lines changed: 71 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1972,6 +1972,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
19721972
return;
19731973
if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) {
19741974
QualType GlobalRecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
1975+
QualType SecGlobalRecTy;
19751976

19761977
// Recover pointer to this function's global record. The runtime will
19771978
// handle the specifics of the allocation of the memory.
@@ -1986,11 +1987,20 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
19861987
llvm::PointerType *GlobalRecPtrTy =
19871988
CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo();
19881989
llvm::Value *GlobalRecCastAddr;
1990+
llvm::Value *IsTTD = nullptr;
19891991
if (WithSPMDCheck ||
19901992
getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown) {
19911993
llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
19921994
llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd");
19931995
llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
1996+
if (I->getSecond().SecondaryGlobalRecord.hasValue()) {
1997+
llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1998+
llvm::Value *ThreadID = getThreadID(CGF, Loc);
1999+
llvm::Value *PL = CGF.EmitRuntimeCall(
2000+
createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
2001+
{RTLoc, ThreadID});
2002+
IsTTD = Bld.CreateIsNull(PL);
2003+
}
19942004
llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall(
19952005
createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode)));
19962006
Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB);
@@ -2003,11 +2013,28 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
20032013
// There is no need to emit line number for unconditional branch.
20042014
(void)ApplyDebugLocation::CreateEmpty(CGF);
20052015
CGF.EmitBlock(NonSPMDBB);
2016+
llvm::Value *Size = llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize);
2017+
if (const RecordDecl *SecGlobalizedVarsRecord =
2018+
I->getSecond().SecondaryGlobalRecord.getValueOr(nullptr)) {
2019+
SecGlobalRecTy =
2020+
CGM.getContext().getRecordType(SecGlobalizedVarsRecord);
2021+
2022+
// Recover pointer to this function's global record. The runtime will
2023+
// handle the specifics of the allocation of the memory.
2024+
// Use actual memory size of the record including the padding
2025+
// for alignment purposes.
2026+
unsigned Alignment =
2027+
CGM.getContext().getTypeAlignInChars(SecGlobalRecTy).getQuantity();
2028+
unsigned GlobalRecordSize =
2029+
CGM.getContext().getTypeSizeInChars(SecGlobalRecTy).getQuantity();
2030+
GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
2031+
Size = Bld.CreateSelect(
2032+
IsTTD, llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), Size);
2033+
}
20062034
// TODO: allow the usage of shared memory to be controlled by
20072035
// the user, for now, default to global.
20082036
llvm::Value *GlobalRecordSizeArg[] = {
2009-
llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
2010-
CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
2037+
Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
20112038
llvm::Value *GlobalRecValue =
20122039
CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
20132040
OMPRTL_NVPTX__kmpc_data_sharing_push_stack),
@@ -2042,6 +2069,17 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
20422069

20432070
// Emit the "global alloca" which is a GEP from the global declaration
20442071
// record using the pointer returned by the runtime.
2072+
LValue SecBase;
2073+
decltype(I->getSecond().LocalVarData)::const_iterator SecIt;
2074+
if (IsTTD) {
2075+
SecIt = I->getSecond().SecondaryLocalVarData->begin();
2076+
llvm::PointerType *SecGlobalRecPtrTy =
2077+
CGF.ConvertTypeForMem(SecGlobalRecTy)->getPointerTo();
2078+
SecBase = CGF.MakeNaturalAlignPointeeAddrLValue(
2079+
Bld.CreatePointerBitCastOrAddrSpaceCast(
2080+
I->getSecond().GlobalRecordAddr, SecGlobalRecPtrTy),
2081+
SecGlobalRecTy);
2082+
}
20452083
for (auto &Rec : I->getSecond().LocalVarData) {
20462084
bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
20472085
llvm::Value *ParValue;
@@ -2055,23 +2093,32 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
20552093
// Emit VarAddr basing on lane-id if required.
20562094
QualType VarTy;
20572095
if (Rec.second.IsOnePerTeam) {
2058-
Rec.second.PrivateAddr = VarAddr.getAddress();
20592096
VarTy = Rec.second.FD->getType();
20602097
} else {
20612098
llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP(
20622099
VarAddr.getAddress().getPointer(),
20632100
{Bld.getInt32(0), getNVPTXLaneID(CGF)});
2064-
Rec.second.PrivateAddr =
2065-
Address(Ptr, CGM.getContext().getDeclAlign(Rec.first));
20662101
VarTy =
20672102
Rec.second.FD->getType()->castAsArrayTypeUnsafe()->getElementType();
2068-
VarAddr = CGF.MakeAddrLValue(Rec.second.PrivateAddr, VarTy,
2069-
AlignmentSource::Decl);
2103+
VarAddr = CGF.MakeAddrLValue(
2104+
Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)), VarTy,
2105+
AlignmentSource::Decl);
20702106
}
2107+
Rec.second.PrivateAddr = VarAddr.getAddress();
20712108
if (WithSPMDCheck ||
2072-
getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown) {
2109+
getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown) {
20732110
assert(I->getSecond().IsInSPMDModeFlag &&
20742111
"Expected unknown execution mode or required SPMD check.");
2112+
if (IsTTD) {
2113+
assert(SecIt->second.IsOnePerTeam &&
2114+
"Secondary glob data must be one per team.");
2115+
LValue SecVarAddr = CGF.EmitLValueForField(SecBase, SecIt->second.FD);
2116+
VarAddr.setAddress(
2117+
Address(Bld.CreateSelect(IsTTD, SecVarAddr.getPointer(),
2118+
VarAddr.getPointer()),
2119+
VarAddr.getAlignment()));
2120+
Rec.second.PrivateAddr = VarAddr.getAddress();
2121+
}
20752122
Address GlobalPtr = Rec.second.PrivateAddr;
20762123
Address LocalAddr = CGF.CreateMemTemp(VarTy, Rec.second.FD->getName());
20772124
Rec.second.PrivateAddr = Address(
@@ -2084,6 +2131,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
20842131
CGF.EmitStoreOfScalar(ParValue, VarAddr);
20852132
I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress());
20862133
}
2134+
++SecIt;
20872135
}
20882136
}
20892137
for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) {
@@ -4115,6 +4163,21 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
41154163
Data.insert(
41164164
std::make_pair(VD, MappedVarData(FD, IsInTargetMasterThreadRegion)));
41174165
}
4166+
if (!IsInTargetMasterThreadRegion && !NeedToDelayGlobalization &&
4167+
!IsInParallelRegion) {
4168+
CheckVarsEscapingDeclContext VarChecker(CGF);
4169+
VarChecker.Visit(Body);
4170+
I->getSecond().SecondaryGlobalRecord =
4171+
VarChecker.getGlobalizedRecord(/*IsInTargetMasterThreadRegion=*/true);
4172+
I->getSecond().SecondaryLocalVarData.emplace();
4173+
DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue();
4174+
for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
4175+
assert(VD->isCanonicalDecl() && "Expected canonical declaration");
4176+
const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
4177+
Data.insert(std::make_pair(
4178+
VD, MappedVarData(FD, /*IsInTargetMasterThreadRegion=*/true)));
4179+
}
4180+
}
41184181
if (!NeedToDelayGlobalization) {
41194182
emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
41204183
struct GlobalizationScope final : EHScopeStack::Cleanup {

clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -376,7 +376,7 @@ class CGOpenMPRuntimeNVPTX : public CGOpenMPRuntime {
376376
/// The data for the single globalized variable.
377377
struct MappedVarData {
378378
/// Corresponding field in the global record.
379-
const FieldDecl * FD = nullptr;
379+
const FieldDecl *FD = nullptr;
380380
/// Corresponding address.
381381
Address PrivateAddr = Address::invalid();
382382
/// true, if only one element is required (for lastprivates in SPMD mode),
@@ -392,10 +392,12 @@ class CGOpenMPRuntimeNVPTX : public CGOpenMPRuntime {
392392
using EscapedParamsTy = llvm::SmallPtrSet<const Decl *, 4>;
393393
struct FunctionData {
394394
DeclToAddrMapTy LocalVarData;
395+
llvm::Optional<DeclToAddrMapTy> SecondaryLocalVarData = llvm::None;
395396
EscapedParamsTy EscapedParameters;
396397
llvm::SmallVector<const ValueDecl*, 4> EscapedVariableLengthDecls;
397398
llvm::SmallVector<llvm::Value *, 4> EscapedVariableLengthDeclsAddrs;
398399
const RecordDecl *GlobalRecord = nullptr;
400+
llvm::Optional<const RecordDecl *> SecondaryGlobalRecord = llvm::None;
399401
llvm::Value *GlobalRecordAddr = nullptr;
400402
llvm::Value *IsInSPMDModeFlag = nullptr;
401403
std::unique_ptr<CodeGenFunction::OMPMapVars> MappedParams;

clang/test/OpenMP/nvptx_target_codegen.cpp

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -557,20 +557,26 @@ int baz(int f, double &a) {
557557
// CHECK: alloca i32,
558558
// CHECK: [[LOCAL_F_PTR:%.+]] = alloca i32,
559559
// CHECK: [[ZERO_ADDR:%.+]] = alloca i32,
560-
// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t*
561560
// CHECK: store i32 0, i32* [[ZERO_ADDR]]
561+
// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t*
562+
// CHECK: [[PAR_LEVEL:%.+]] = call i16 @__kmpc_parallel_level(%struct.ident_t* @0, i32 [[GTID]])
563+
// CHECK: [[IS_TTD:%.+]] = icmp eq i16 %1, 0
562564
// CHECK: [[RES:%.+]] = call i8 @__kmpc_is_spmd_exec_mode()
563565
// CHECK: [[IS_SPMD:%.+]] = icmp ne i8 [[RES]], 0
564566
// CHECK: br i1 [[IS_SPMD]], label
565567
// CHECK: br label
566-
// CHECK: [[PTR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 128, i16 0)
568+
// CHECK: [[SIZE:%.+]] = select i1 [[IS_TTD]], i{{64|32}} 4, i{{64|32}} 128
569+
// CHECK: [[PTR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} [[SIZE]], i16 0)
567570
// CHECK: [[REC_ADDR:%.+]] = bitcast i8* [[PTR]] to [[GLOBAL_ST:%.+]]*
568571
// CHECK: br label
569572
// CHECK: [[ITEMS:%.+]] = phi [[GLOBAL_ST]]* [ null, {{.+}} ], [ [[REC_ADDR]], {{.+}} ]
573+
// CHECK: [[TTD_ITEMS:%.+]] = bitcast [[GLOBAL_ST]]* [[ITEMS]] to [[SEC_GLOBAL_ST:%.+]]*
570574
// CHECK: [[F_PTR_ARR:%.+]] = getelementptr inbounds [[GLOBAL_ST]], [[GLOBAL_ST]]* [[ITEMS]], i32 0, i32 0
571575
// CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
572576
// CHECK: [[LID:%.+]] = and i32 [[TID]], 31
573-
// CHECK: [[GLOBAL_F_PTR:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[F_PTR_ARR]], i32 0, i32 [[LID]]
577+
// CHECK: [[GLOBAL_F_PTR_PAR:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[F_PTR_ARR]], i32 0, i32 [[LID]]
578+
// CHECK: [[GLOBAL_F_PTR_TTD:%.+]] = getelementptr inbounds [[SEC_GLOBAL_ST]], [[SEC_GLOBAL_ST]]* [[TTD_ITEMS]], i32 0, i32 0
579+
// CHECK: [[GLOBAL_F_PTR:%.+]] = select i1 [[IS_TTD]], i32* [[GLOBAL_F_PTR_TTD]], i32* [[GLOBAL_F_PTR_PAR]]
574580
// CHECK: [[F_PTR:%.+]] = select i1 [[IS_SPMD]], i32* [[LOCAL_F_PTR]], i32* [[GLOBAL_F_PTR]]
575581
// CHECK: store i32 %{{.+}}, i32* [[F_PTR]],
576582

0 commit comments

Comments
 (0)