Skip to content

Commit 1859bd4

Browse files
committed
[OpenMP][FIX] Allocate per launch memory for GPU team reductions
We used to perform team reductions in global memory allocated by the runtime and by Clang. This was racy because multiple instances of a kernel, or different kernels with team reductions, would use the same locations. Since we now have the kernel launch environment, we can allocate dynamic memory per launch, which lets us move all the state into a non-racy place. Fixes: #70249
1 parent 7dc20ab commit 1859bd4

File tree

9 files changed

+231
-195
lines changed

9 files changed

+231
-195
lines changed

clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp

Lines changed: 28 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -803,8 +803,30 @@ void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
803803
if (!IsSPMD)
804804
emitGenericVarsEpilog(CGF);
805805

806+
// This is temporary until we remove the fixed-size buffer.
807+
ASTContext &C = CGM.getContext();
808+
RecordDecl *StaticRD = C.buildImplicitRecord(
809+
"_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
810+
StaticRD->startDefinition();
811+
for (const RecordDecl *TeamReductionRec : TeamsReductions) {
812+
QualType RecTy = C.getRecordType(TeamReductionRec);
813+
auto *Field = FieldDecl::Create(
814+
C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
815+
C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
816+
/*BW=*/nullptr, /*Mutable=*/false,
817+
/*InitStyle=*/ICIS_NoInit);
818+
Field->setAccess(AS_public);
819+
StaticRD->addDecl(Field);
820+
}
821+
StaticRD->completeDefinition();
822+
QualType StaticTy = C.getRecordType(StaticRD);
823+
llvm::Type *LLVMReductionsBufferTy =
824+
CGM.getTypes().ConvertTypeForMem(StaticTy);
825+
const auto &DL = CGM.getModule().getDataLayout();
826+
uint64_t BufferSize =
827+
DL.getTypeAllocSize(LLVMReductionsBufferTy).getFixedValue();
806828
CGBuilderTy &Bld = CGF.Builder;
807-
OMPBuilder.createTargetDeinit(Bld);
829+
OMPBuilder.createTargetDeinit(Bld, BufferSize);
808830
}
809831

810832
void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
@@ -2998,15 +3020,10 @@ void CGOpenMPRuntimeGPU::emitReduction(
29983020
CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap,
29993021
C.getLangOpts().OpenMPCUDAReductionBufNum);
30003022
TeamsReductions.push_back(TeamReductionRec);
3001-
if (!KernelTeamsReductionPtr) {
3002-
KernelTeamsReductionPtr = new llvm::GlobalVariable(
3003-
CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
3004-
llvm::GlobalValue::InternalLinkage, nullptr,
3005-
"_openmp_teams_reductions_buffer_$_$ptr");
3006-
}
3007-
llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
3008-
Address(KernelTeamsReductionPtr, CGF.VoidPtrTy, CGM.getPointerAlign()),
3009-
/*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
3023+
auto *KernelTeamsReductionPtr = CGF.EmitRuntimeCall(
3024+
OMPBuilder.getOrCreateRuntimeFunction(
3025+
CGM.getModule(), OMPRTL___kmpc_reduction_get_fixed_buffer),
3026+
{}, "_openmp_teams_reductions_buffer_$_$ptr");
30103027
llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
30113028
CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
30123029
llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
@@ -3021,7 +3038,7 @@ void CGOpenMPRuntimeGPU::emitReduction(
30213038
llvm::Value *Args[] = {
30223039
RTLoc,
30233040
ThreadId,
3024-
GlobalBufferPtr,
3041+
KernelTeamsReductionPtr,
30253042
CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
30263043
RL,
30273044
ShuffleAndReduceFn,
@@ -3654,42 +3671,6 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(
36543671
CGOpenMPRuntime::processRequiresDirective(D);
36553672
}
36563673

3657-
void CGOpenMPRuntimeGPU::clear() {
3658-
3659-
if (!TeamsReductions.empty()) {
3660-
ASTContext &C = CGM.getContext();
3661-
RecordDecl *StaticRD = C.buildImplicitRecord(
3662-
"_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
3663-
StaticRD->startDefinition();
3664-
for (const RecordDecl *TeamReductionRec : TeamsReductions) {
3665-
QualType RecTy = C.getRecordType(TeamReductionRec);
3666-
auto *Field = FieldDecl::Create(
3667-
C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
3668-
C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
3669-
/*BW=*/nullptr, /*Mutable=*/false,
3670-
/*InitStyle=*/ICIS_NoInit);
3671-
Field->setAccess(AS_public);
3672-
StaticRD->addDecl(Field);
3673-
}
3674-
StaticRD->completeDefinition();
3675-
QualType StaticTy = C.getRecordType(StaticRD);
3676-
llvm::Type *LLVMReductionsBufferTy =
3677-
CGM.getTypes().ConvertTypeForMem(StaticTy);
3678-
// FIXME: nvlink does not handle weak linkage correctly (objects with
3679-
// different sizes are reported as erroneous).
3680-
// Restore CommonLinkage as soon as nvlink is fixed.
3681-
auto *GV = new llvm::GlobalVariable(
3682-
CGM.getModule(), LLVMReductionsBufferTy,
3683-
/*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
3684-
llvm::Constant::getNullValue(LLVMReductionsBufferTy),
3685-
"_openmp_teams_reductions_buffer_$_");
3686-
KernelTeamsReductionPtr->setInitializer(
3687-
llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
3688-
CGM.VoidPtrTy));
3689-
}
3690-
CGOpenMPRuntime::clear();
3691-
}
3692-
36933674
llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
36943675
CGBuilderTy &Bld = CGF.Builder;
36953676
llvm::Module *M = &CGF.CGM.getModule();

clang/lib/CodeGen/CGOpenMPRuntimeGPU.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,6 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
130130

131131
public:
132132
explicit CGOpenMPRuntimeGPU(CodeGenModule &CGM);
133-
void clear() override;
134133

135134
bool isGPU() const override { return true; };
136135

@@ -386,7 +385,6 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
386385
/// Maps the function to the list of the globalized variables with their
387386
/// addresses.
388387
llvm::SmallDenseMap<llvm::Function *, FunctionData> FunctionGlobalizedDecls;
389-
llvm::GlobalVariable *KernelTeamsReductionPtr = nullptr;
390388
/// List of the records with the list of fields for the reductions across the
391389
/// teams. Used to build the intermediate buffer for the fast teams
392390
/// reductions.

0 commit comments

Comments
 (0)