Skip to content

Commit 3de645e

Browse files
committed
[OpenMP][NFC] Split the reduction buffer size into two components
Previously, we tracked the size of the teams reduction buffer in order to allocate it at runtime per kernel launch. This patch splits that number into two parts: the size of the reduction data (i.e., all reduction variables) and the (maximal) length of the buffer. This will allow us to allocate less if we need less, e.g., if we have fewer teams than the maximal length. It also allows us to move code from Clang's codegen into the runtime, as we now know how large the reduction data is.
1 parent 921bd29 commit 3de645e

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+683
-717
lines changed

clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp

Lines changed: 27 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -801,10 +801,12 @@ void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
801801
llvm::Type *LLVMReductionsBufferTy =
802802
CGM.getTypes().ConvertTypeForMem(StaticTy);
803803
const auto &DL = CGM.getModule().getDataLayout();
804-
uint64_t BufferSize =
804+
uint64_t ReductionDataSize =
805805
DL.getTypeAllocSize(LLVMReductionsBufferTy).getFixedValue();
806806
CGBuilderTy &Bld = CGF.Builder;
807-
OMPBuilder.createTargetDeinit(Bld, BufferSize);
807+
OMPBuilder.createTargetDeinit(Bld, ReductionDataSize,
808+
C.getLangOpts().OpenMPCUDAReductionBufNum);
809+
TeamsReductions.clear();
808810
}
809811

810812
void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
@@ -2828,15 +2830,25 @@ void CGOpenMPRuntimeGPU::emitReduction(
28282830
assert((TeamsReduction || ParallelReduction) &&
28292831
"Invalid reduction selection in emitReduction.");
28302832

2833+
llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
2834+
llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
2835+
int Cnt = 0;
2836+
for (const Expr *DRE : Privates) {
2837+
PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
2838+
++Cnt;
2839+
}
2840+
2841+
ASTContext &C = CGM.getContext();
2842+
const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars(
2843+
CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap, 1);
2844+
28312845
// Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
28322846
// RedList, shuffle_reduce_func, interwarp_copy_func);
28332847
// or
28342848
// Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
28352849
llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2836-
llvm::Value *ThreadId = getThreadID(CGF, Loc);
28372850

28382851
llvm::Value *Res;
2839-
ASTContext &C = CGM.getContext();
28402852
// 1. Build a list of reduction variables.
28412853
// void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
28422854
auto Size = RHSExprs.size();
@@ -2878,19 +2890,17 @@ void CGOpenMPRuntimeGPU::emitReduction(
28782890
llvm::Function *ReductionFn = emitReductionFunction(
28792891
CGF.CurFn->getName(), Loc, CGF.ConvertTypeForMem(ReductionArrayTy),
28802892
Privates, LHSExprs, RHSExprs, ReductionOps);
2881-
llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
2893+
llvm::Value *ReductionDataSize =
2894+
CGF.getTypeSize(C.getRecordType(ReductionRec));
2895+
ReductionDataSize =
2896+
CGF.Builder.CreateSExtOrTrunc(ReductionDataSize, CGF.Int64Ty);
28822897
llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
28832898
CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
28842899
llvm::Value *InterWarpCopyFn =
28852900
emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
28862901

28872902
if (ParallelReduction) {
2888-
llvm::Value *Args[] = {RTLoc,
2889-
ThreadId,
2890-
CGF.Builder.getInt32(RHSExprs.size()),
2891-
ReductionArrayTySize,
2892-
RL,
2893-
ShuffleAndReduceFn,
2903+
llvm::Value *Args[] = {RTLoc, ReductionDataSize, RL, ShuffleAndReduceFn,
28942904
InterWarpCopyFn};
28952905

28962906
Res = CGF.EmitRuntimeCall(
@@ -2899,37 +2909,27 @@ void CGOpenMPRuntimeGPU::emitReduction(
28992909
Args);
29002910
} else {
29012911
assert(TeamsReduction && "expected teams reduction.");
2902-
llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
2903-
llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
2904-
int Cnt = 0;
2905-
for (const Expr *DRE : Privates) {
2906-
PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
2907-
++Cnt;
2908-
}
2909-
const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars(
2910-
CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap,
2911-
C.getLangOpts().OpenMPCUDAReductionBufNum);
2912-
TeamsReductions.push_back(TeamReductionRec);
2912+
TeamsReductions.push_back(ReductionRec);
29132913
auto *KernelTeamsReductionPtr = CGF.EmitRuntimeCall(
29142914
OMPBuilder.getOrCreateRuntimeFunction(
29152915
CGM.getModule(), OMPRTL___kmpc_reduction_get_fixed_buffer),
29162916
{}, "_openmp_teams_reductions_buffer_$_$ptr");
29172917
llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
2918-
CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
2918+
CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap);
29192919
llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
2920-
CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
2920+
CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap,
29212921
ReductionFn);
29222922
llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
2923-
CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
2923+
CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap);
29242924
llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
2925-
CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
2925+
CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap,
29262926
ReductionFn);
29272927

29282928
llvm::Value *Args[] = {
29292929
RTLoc,
2930-
ThreadId,
29312930
KernelTeamsReductionPtr,
29322931
CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
2932+
ReductionDataSize,
29332933
RL,
29342934
ShuffleAndReduceFn,
29352935
InterWarpCopyFn,

0 commit comments

Comments
 (0)