Skip to content

Commit a4a5bbf

Browse files
jdoerfert authored and ronlieb committed
[OpenMP][NFC] Split the reduction buffer size into two components
Before, we tracked the size of the teams reduction buffer in order to allocate it at runtime per kernel launch. This patch splits that number into two parts: the size of the reduction data (= all reduction variables) and the (maximal) length of the buffer. This will allow us to allocate less if we need less, e.g., if we have fewer teams than the maximal length. It also allows us to move code from clang's codegen into the runtime, as we now know how large the reduction data is. Change-Id: I936fd6a99109fac7c6a452dd79de9834e837bcca
1 parent bac3286 commit a4a5bbf

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+777
-719
lines changed

clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp

Lines changed: 27 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -921,10 +921,12 @@ void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
921921
llvm::Type *LLVMReductionsBufferTy =
922922
CGM.getTypes().ConvertTypeForMem(StaticTy);
923923
const auto &DL = CGM.getModule().getDataLayout();
924-
uint64_t BufferSize =
924+
uint64_t ReductionDataSize =
925925
DL.getTypeAllocSize(LLVMReductionsBufferTy).getFixedValue();
926926
CGBuilderTy &Bld = CGF.Builder;
927-
OMPBuilder.createTargetDeinit(Bld, BufferSize);
927+
OMPBuilder.createTargetDeinit(Bld, ReductionDataSize,
928+
C.getLangOpts().OpenMPCUDAReductionBufNum);
929+
TeamsReductions.clear();
928930
}
929931

930932
void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
@@ -3023,15 +3025,25 @@ void CGOpenMPRuntimeGPU::emitReduction(
30233025
assert((TeamsReduction || ParallelReduction) &&
30243026
"Invalid reduction selection in emitReduction.");
30253027

3028+
llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
3029+
llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
3030+
int Cnt = 0;
3031+
for (const Expr *DRE : Privates) {
3032+
PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
3033+
++Cnt;
3034+
}
3035+
3036+
ASTContext &C = CGM.getContext();
3037+
const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars(
3038+
CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap, 1);
3039+
30263040
// Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
30273041
// RedList, shuffle_reduce_func, interwarp_copy_func);
30283042
// or
30293043
// Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
30303044
llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
3031-
llvm::Value *ThreadId = getThreadID(CGF, Loc);
30323045

30333046
llvm::Value *Res;
3034-
ASTContext &C = CGM.getContext();
30353047
// 1. Build a list of reduction variables.
30363048
// void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
30373049
auto Size = RHSExprs.size();
@@ -3073,19 +3085,17 @@ void CGOpenMPRuntimeGPU::emitReduction(
30733085
llvm::Function *ReductionFn = emitReductionFunction(
30743086
CGF.CurFn->getName(), Loc, CGF.ConvertTypeForMem(ReductionArrayTy),
30753087
Privates, LHSExprs, RHSExprs, ReductionOps);
3076-
llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
3088+
llvm::Value *ReductionDataSize =
3089+
CGF.getTypeSize(C.getRecordType(ReductionRec));
3090+
ReductionDataSize =
3091+
CGF.Builder.CreateSExtOrTrunc(ReductionDataSize, CGF.Int64Ty);
30773092
llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
30783093
CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
30793094
llvm::Value *InterWarpCopyFn =
30803095
emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
30813096

30823097
if (ParallelReduction) {
3083-
llvm::Value *Args[] = {RTLoc,
3084-
ThreadId,
3085-
CGF.Builder.getInt32(RHSExprs.size()),
3086-
ReductionArrayTySize,
3087-
RL,
3088-
ShuffleAndReduceFn,
3098+
llvm::Value *Args[] = {RTLoc, ReductionDataSize, RL, ShuffleAndReduceFn,
30893099
InterWarpCopyFn};
30903100

30913101
Res = CGF.EmitRuntimeCall(
@@ -3094,37 +3104,27 @@ void CGOpenMPRuntimeGPU::emitReduction(
30943104
Args);
30953105
} else {
30963106
assert(TeamsReduction && "expected teams reduction.");
3097-
llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
3098-
llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
3099-
int Cnt = 0;
3100-
for (const Expr *DRE : Privates) {
3101-
PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
3102-
++Cnt;
3103-
}
3104-
const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars(
3105-
CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap,
3106-
C.getLangOpts().OpenMPCUDAReductionBufNum);
3107-
TeamsReductions.push_back(TeamReductionRec);
3107+
TeamsReductions.push_back(ReductionRec);
31083108
auto *KernelTeamsReductionPtr = CGF.EmitRuntimeCall(
31093109
OMPBuilder.getOrCreateRuntimeFunction(
31103110
CGM.getModule(), OMPRTL___kmpc_reduction_get_fixed_buffer),
31113111
{}, "_openmp_teams_reductions_buffer_$_$ptr");
31123112
llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
3113-
CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
3113+
CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap);
31143114
llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
3115-
CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
3115+
CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap,
31163116
ReductionFn);
31173117
llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
3118-
CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
3118+
CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap);
31193119
llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
3120-
CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
3120+
CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap,
31213121
ReductionFn);
31223122

31233123
llvm::Value *Args[] = {
31243124
RTLoc,
3125-
ThreadId,
31263125
KernelTeamsReductionPtr,
31273126
CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
3127+
ReductionDataSize,
31283128
RL,
31293129
ShuffleAndReduceFn,
31303130
InterWarpCopyFn,

0 commit comments

Comments
 (0)