Skip to content

Commit 2d8a42b

Browse files
jdoerfert authored and ronlieb committed
[OpenMP][FIX] Allocate per launch memory for GPU team reductions (llvm#70752)
We used to perform team reductions on global memory allocated in the runtime and by clang. This was racy, as multiple instances of a kernel, or different kernels with team reductions, would use the same locations. Since we now have the kernel launch environment, we can allocate dynamic memory per launch, allowing us to move all the state into a non-racy place.

Fixes: llvm#70249

Change-Id: Id8a5932a1cde8cfcbb0e17655ef3f390f6f4d050
1 parent 5d3ba1d commit 2d8a42b

File tree

8 files changed

+195
-195
lines changed

8 files changed

+195
-195
lines changed

clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp

Lines changed: 28 additions & 47 deletions
Original file line number | Diff line number | Diff line change
@@ -923,8 +923,30 @@ void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
923923
if (!IsSPMD)
924924
emitGenericVarsEpilog(CGF);
925925

926+
// This is temporary until we remove the fixed sized buffer.
927+
ASTContext &C = CGM.getContext();
928+
RecordDecl *StaticRD = C.buildImplicitRecord(
929+
"_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
930+
StaticRD->startDefinition();
931+
for (const RecordDecl *TeamReductionRec : TeamsReductions) {
932+
QualType RecTy = C.getRecordType(TeamReductionRec);
933+
auto *Field = FieldDecl::Create(
934+
C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
935+
C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
936+
/*BW=*/nullptr, /*Mutable=*/false,
937+
/*InitStyle=*/ICIS_NoInit);
938+
Field->setAccess(AS_public);
939+
StaticRD->addDecl(Field);
940+
}
941+
StaticRD->completeDefinition();
942+
QualType StaticTy = C.getRecordType(StaticRD);
943+
llvm::Type *LLVMReductionsBufferTy =
944+
CGM.getTypes().ConvertTypeForMem(StaticTy);
945+
const auto &DL = CGM.getModule().getDataLayout();
946+
uint64_t BufferSize =
947+
DL.getTypeAllocSize(LLVMReductionsBufferTy).getFixedValue();
926948
CGBuilderTy &Bld = CGF.Builder;
927-
OMPBuilder.createTargetDeinit(Bld);
949+
OMPBuilder.createTargetDeinit(Bld, BufferSize);
928950
}
929951

930952
void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
@@ -3193,15 +3215,10 @@ void CGOpenMPRuntimeGPU::emitReduction(
31933215
CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap,
31943216
C.getLangOpts().OpenMPCUDAReductionBufNum);
31953217
TeamsReductions.push_back(TeamReductionRec);
3196-
if (!KernelTeamsReductionPtr) {
3197-
KernelTeamsReductionPtr = new llvm::GlobalVariable(
3198-
CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
3199-
llvm::GlobalValue::InternalLinkage, nullptr,
3200-
"_openmp_teams_reductions_buffer_$_$ptr");
3201-
}
3202-
llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
3203-
Address(KernelTeamsReductionPtr, CGF.VoidPtrTy, CGM.getPointerAlign()),
3204-
/*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
3218+
auto *KernelTeamsReductionPtr = CGF.EmitRuntimeCall(
3219+
OMPBuilder.getOrCreateRuntimeFunction(
3220+
CGM.getModule(), OMPRTL___kmpc_reduction_get_fixed_buffer),
3221+
{}, "_openmp_teams_reductions_buffer_$_$ptr");
32053222
llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
32063223
CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
32073224
llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
@@ -3216,7 +3233,7 @@ void CGOpenMPRuntimeGPU::emitReduction(
32163233
llvm::Value *Args[] = {
32173234
RTLoc,
32183235
ThreadId,
3219-
GlobalBufferPtr,
3236+
KernelTeamsReductionPtr,
32203237
CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
32213238
RL,
32223239
ShuffleAndReduceFn,
@@ -3859,42 +3876,6 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(
38593876
CGOpenMPRuntime::processRequiresDirective(D);
38603877
}
38613878

3862-
void CGOpenMPRuntimeGPU::clear() {
3863-
3864-
if (!TeamsReductions.empty()) {
3865-
ASTContext &C = CGM.getContext();
3866-
RecordDecl *StaticRD = C.buildImplicitRecord(
3867-
"_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
3868-
StaticRD->startDefinition();
3869-
for (const RecordDecl *TeamReductionRec : TeamsReductions) {
3870-
QualType RecTy = C.getRecordType(TeamReductionRec);
3871-
auto *Field = FieldDecl::Create(
3872-
C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
3873-
C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
3874-
/*BW=*/nullptr, /*Mutable=*/false,
3875-
/*InitStyle=*/ICIS_NoInit);
3876-
Field->setAccess(AS_public);
3877-
StaticRD->addDecl(Field);
3878-
}
3879-
StaticRD->completeDefinition();
3880-
QualType StaticTy = C.getRecordType(StaticRD);
3881-
llvm::Type *LLVMReductionsBufferTy =
3882-
CGM.getTypes().ConvertTypeForMem(StaticTy);
3883-
// FIXME: nvlink does not handle weak linkage correctly (object with the
3884-
// different size are reported as erroneous).
3885-
// Restore CommonLinkage as soon as nvlink is fixed.
3886-
auto *GV = new llvm::GlobalVariable(
3887-
CGM.getModule(), LLVMReductionsBufferTy,
3888-
/*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
3889-
llvm::Constant::getNullValue(LLVMReductionsBufferTy),
3890-
"_openmp_teams_reductions_buffer_$_");
3891-
KernelTeamsReductionPtr->setInitializer(
3892-
llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
3893-
CGM.VoidPtrTy));
3894-
}
3895-
CGOpenMPRuntime::clear();
3896-
}
3897-
38983879
llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
38993880
CGBuilderTy &Bld = CGF.Builder;
39003881
llvm::Module *M = &CGF.CGM.getModule();

clang/lib/CodeGen/CGOpenMPRuntimeGPU.h

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -133,7 +133,6 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
133133

134134
public:
135135
explicit CGOpenMPRuntimeGPU(CodeGenModule &CGM);
136-
void clear() override;
137136

138137
bool isGPU() const override { return true; };
139138

@@ -433,7 +432,6 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
433432
/// Maps the function to the list of the globalized variables with their
434433
/// addresses.
435434
llvm::SmallDenseMap<llvm::Function *, FunctionData> FunctionGlobalizedDecls;
436-
llvm::GlobalVariable *KernelTeamsReductionPtr = nullptr;
437435
/// List of the records with the list of fields for the reductions across the
438436
/// teams. Used to build the intermediate buffer for the fast teams
439437
/// reductions.

0 commit comments

Comments
 (0)