Skip to content

Commit f9a89e6

Browse files
authored
[OpenMP][FIX] Allocate per launch memory for GPU team reductions (#70752)
We used to perform team reductions on global memory allocated both in the runtime and by Clang. This was racy, as multiple instances of a kernel, or different kernels with team reductions, would use the same locations. Since we now have the kernel launch environment, we can allocate dynamic memory per launch, allowing us to move all the state into a non-racy place. Fixes: #70249
1 parent 0d3377c commit f9a89e6

File tree

9 files changed

+231
-195
lines changed

9 files changed

+231
-195
lines changed

clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp

Lines changed: 28 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -803,8 +803,30 @@ void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
803803
if (!IsSPMD)
804804
emitGenericVarsEpilog(CGF);
805805

806+
// This is temporary until we remove the fixed sized buffer.
807+
ASTContext &C = CGM.getContext();
808+
RecordDecl *StaticRD = C.buildImplicitRecord(
809+
"_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
810+
StaticRD->startDefinition();
811+
for (const RecordDecl *TeamReductionRec : TeamsReductions) {
812+
QualType RecTy = C.getRecordType(TeamReductionRec);
813+
auto *Field = FieldDecl::Create(
814+
C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
815+
C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
816+
/*BW=*/nullptr, /*Mutable=*/false,
817+
/*InitStyle=*/ICIS_NoInit);
818+
Field->setAccess(AS_public);
819+
StaticRD->addDecl(Field);
820+
}
821+
StaticRD->completeDefinition();
822+
QualType StaticTy = C.getRecordType(StaticRD);
823+
llvm::Type *LLVMReductionsBufferTy =
824+
CGM.getTypes().ConvertTypeForMem(StaticTy);
825+
const auto &DL = CGM.getModule().getDataLayout();
826+
uint64_t BufferSize =
827+
DL.getTypeAllocSize(LLVMReductionsBufferTy).getFixedValue();
806828
CGBuilderTy &Bld = CGF.Builder;
807-
OMPBuilder.createTargetDeinit(Bld);
829+
OMPBuilder.createTargetDeinit(Bld, BufferSize);
808830
}
809831

810832
void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
@@ -2998,15 +3020,10 @@ void CGOpenMPRuntimeGPU::emitReduction(
29983020
CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap,
29993021
C.getLangOpts().OpenMPCUDAReductionBufNum);
30003022
TeamsReductions.push_back(TeamReductionRec);
3001-
if (!KernelTeamsReductionPtr) {
3002-
KernelTeamsReductionPtr = new llvm::GlobalVariable(
3003-
CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
3004-
llvm::GlobalValue::InternalLinkage, nullptr,
3005-
"_openmp_teams_reductions_buffer_$_$ptr");
3006-
}
3007-
llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
3008-
Address(KernelTeamsReductionPtr, CGF.VoidPtrTy, CGM.getPointerAlign()),
3009-
/*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
3023+
auto *KernelTeamsReductionPtr = CGF.EmitRuntimeCall(
3024+
OMPBuilder.getOrCreateRuntimeFunction(
3025+
CGM.getModule(), OMPRTL___kmpc_reduction_get_fixed_buffer),
3026+
{}, "_openmp_teams_reductions_buffer_$_$ptr");
30103027
llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
30113028
CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
30123029
llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
@@ -3021,7 +3038,7 @@ void CGOpenMPRuntimeGPU::emitReduction(
30213038
llvm::Value *Args[] = {
30223039
RTLoc,
30233040
ThreadId,
3024-
GlobalBufferPtr,
3041+
KernelTeamsReductionPtr,
30253042
CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
30263043
RL,
30273044
ShuffleAndReduceFn,
@@ -3654,42 +3671,6 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(
36543671
CGOpenMPRuntime::processRequiresDirective(D);
36553672
}
36563673

3657-
void CGOpenMPRuntimeGPU::clear() {
3658-
3659-
if (!TeamsReductions.empty()) {
3660-
ASTContext &C = CGM.getContext();
3661-
RecordDecl *StaticRD = C.buildImplicitRecord(
3662-
"_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
3663-
StaticRD->startDefinition();
3664-
for (const RecordDecl *TeamReductionRec : TeamsReductions) {
3665-
QualType RecTy = C.getRecordType(TeamReductionRec);
3666-
auto *Field = FieldDecl::Create(
3667-
C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
3668-
C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
3669-
/*BW=*/nullptr, /*Mutable=*/false,
3670-
/*InitStyle=*/ICIS_NoInit);
3671-
Field->setAccess(AS_public);
3672-
StaticRD->addDecl(Field);
3673-
}
3674-
StaticRD->completeDefinition();
3675-
QualType StaticTy = C.getRecordType(StaticRD);
3676-
llvm::Type *LLVMReductionsBufferTy =
3677-
CGM.getTypes().ConvertTypeForMem(StaticTy);
3678-
// FIXME: nvlink does not handle weak linkage correctly (object with the
3679-
// different size are reported as erroneous).
3680-
// Restore CommonLinkage as soon as nvlink is fixed.
3681-
auto *GV = new llvm::GlobalVariable(
3682-
CGM.getModule(), LLVMReductionsBufferTy,
3683-
/*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
3684-
llvm::Constant::getNullValue(LLVMReductionsBufferTy),
3685-
"_openmp_teams_reductions_buffer_$_");
3686-
KernelTeamsReductionPtr->setInitializer(
3687-
llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
3688-
CGM.VoidPtrTy));
3689-
}
3690-
CGOpenMPRuntime::clear();
3691-
}
3692-
36933674
llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
36943675
CGBuilderTy &Bld = CGF.Builder;
36953676
llvm::Module *M = &CGF.CGM.getModule();

clang/lib/CodeGen/CGOpenMPRuntimeGPU.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,6 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
130130

131131
public:
132132
explicit CGOpenMPRuntimeGPU(CodeGenModule &CGM);
133-
void clear() override;
134133

135134
bool isGPU() const override { return true; };
136135

@@ -386,7 +385,6 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
386385
/// Maps the function to the list of the globalized variables with their
387386
/// addresses.
388387
llvm::SmallDenseMap<llvm::Function *, FunctionData> FunctionGlobalizedDecls;
389-
llvm::GlobalVariable *KernelTeamsReductionPtr = nullptr;
390388
/// List of the records with the list of fields for the reductions across the
391389
/// teams. Used to build the intermediate buffer for the fast teams
392390
/// reductions.

0 commit comments

Comments
 (0)