-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[OpenMP][FIX] Allocate per launch memory for GPU team reductions #70752
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-mlir-llvm @llvm/pr-subscribers-openmp Author: Johannes Doerfert (jdoerfert) Changes: First commit is part of #70401. Patch is 4.72 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/70752.diff 186 Files Affected:
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 9d00ebae702802a..de028b0209c171a 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -803,8 +803,30 @@ void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
if (!IsSPMD)
emitGenericVarsEpilog(CGF);
+ // This is temporary until we remove the fixed sized buffer.
+ ASTContext &C = CGM.getContext();
+ RecordDecl *StaticRD = C.buildImplicitRecord(
+ "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
+ StaticRD->startDefinition();
+ for (const RecordDecl *TeamReductionRec : TeamsReductions) {
+ QualType RecTy = C.getRecordType(TeamReductionRec);
+ auto *Field = FieldDecl::Create(
+ C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
+ C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
+ /*BW=*/nullptr, /*Mutable=*/false,
+ /*InitStyle=*/ICIS_NoInit);
+ Field->setAccess(AS_public);
+ StaticRD->addDecl(Field);
+ }
+ StaticRD->completeDefinition();
+ QualType StaticTy = C.getRecordType(StaticRD);
+ llvm::Type *LLVMReductionsBufferTy =
+ CGM.getTypes().ConvertTypeForMem(StaticTy);
+ const auto &DL = CGM.getModule().getDataLayout();
+ uint64_t BufferSize =
+ DL.getTypeAllocSize(LLVMReductionsBufferTy).getFixedValue();
CGBuilderTy &Bld = CGF.Builder;
- OMPBuilder.createTargetDeinit(Bld);
+ OMPBuilder.createTargetDeinit(Bld, BufferSize);
}
void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
@@ -2998,15 +3020,10 @@ void CGOpenMPRuntimeGPU::emitReduction(
CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap,
C.getLangOpts().OpenMPCUDAReductionBufNum);
TeamsReductions.push_back(TeamReductionRec);
- if (!KernelTeamsReductionPtr) {
- KernelTeamsReductionPtr = new llvm::GlobalVariable(
- CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
- llvm::GlobalValue::InternalLinkage, nullptr,
- "_openmp_teams_reductions_buffer_$_$ptr");
- }
- llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
- Address(KernelTeamsReductionPtr, CGF.VoidPtrTy, CGM.getPointerAlign()),
- /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
+ auto *KernelTeamsReductionPtr = CGF.EmitRuntimeCall(
+ OMPBuilder.getOrCreateRuntimeFunction(
+ CGM.getModule(), OMPRTL___kmpc_reduction_get_fixed_buffer),
+ {}, "_openmp_teams_reductions_buffer_$_$ptr");
llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
@@ -3021,7 +3038,7 @@ void CGOpenMPRuntimeGPU::emitReduction(
llvm::Value *Args[] = {
RTLoc,
ThreadId,
- GlobalBufferPtr,
+ KernelTeamsReductionPtr,
CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
RL,
ShuffleAndReduceFn,
@@ -3654,42 +3671,6 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(
CGOpenMPRuntime::processRequiresDirective(D);
}
-void CGOpenMPRuntimeGPU::clear() {
-
- if (!TeamsReductions.empty()) {
- ASTContext &C = CGM.getContext();
- RecordDecl *StaticRD = C.buildImplicitRecord(
- "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
- StaticRD->startDefinition();
- for (const RecordDecl *TeamReductionRec : TeamsReductions) {
- QualType RecTy = C.getRecordType(TeamReductionRec);
- auto *Field = FieldDecl::Create(
- C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
- C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
- /*BW=*/nullptr, /*Mutable=*/false,
- /*InitStyle=*/ICIS_NoInit);
- Field->setAccess(AS_public);
- StaticRD->addDecl(Field);
- }
- StaticRD->completeDefinition();
- QualType StaticTy = C.getRecordType(StaticRD);
- llvm::Type *LLVMReductionsBufferTy =
- CGM.getTypes().ConvertTypeForMem(StaticTy);
- // FIXME: nvlink does not handle weak linkage correctly (object with the
- // different size are reported as erroneous).
- // Restore CommonLinkage as soon as nvlink is fixed.
- auto *GV = new llvm::GlobalVariable(
- CGM.getModule(), LLVMReductionsBufferTy,
- /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
- llvm::Constant::getNullValue(LLVMReductionsBufferTy),
- "_openmp_teams_reductions_buffer_$_");
- KernelTeamsReductionPtr->setInitializer(
- llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
- CGM.VoidPtrTy));
- }
- CGOpenMPRuntime::clear();
-}
-
llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
CGBuilderTy &Bld = CGF.Builder;
llvm::Module *M = &CGF.CGM.getModule();
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
index 46e1361f2f895ba..141436f26230dde 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
@@ -130,7 +130,6 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
public:
explicit CGOpenMPRuntimeGPU(CodeGenModule &CGM);
- void clear() override;
bool isGPU() const override { return true; };
@@ -386,7 +385,6 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
/// Maps the function to the list of the globalized variables with their
/// addresses.
llvm::SmallDenseMap<llvm::Function *, FunctionData> FunctionGlobalizedDecls;
- llvm::GlobalVariable *KernelTeamsReductionPtr = nullptr;
/// List of the records with the list of fields for the reductions across the
/// teams. Used to build the intermediate buffer for the fast teams
/// reductions.
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 75f9e152dca9297..145f4dc4670081d 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -4249,12 +4249,15 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) {
getCurCapturedRegion()->TheCapturedDecl->addAttr(
AlwaysInlineAttr::CreateImplicit(
Context, {}, AlwaysInlineAttr::Keyword_forceinline));
- Sema::CapturedParamNameType ParamsTarget[] = {
- std::make_pair(StringRef(), QualType()) // __context with shared vars
- };
+ SmallVector<Sema::CapturedParamNameType, 2> ParamsTarget;
+ if (getLangOpts().OpenMPIsTargetDevice)
+ ParamsTarget.push_back(std::make_pair(StringRef("dyn_ptr"), VoidPtrTy));
+ ParamsTarget.push_back(
+ std::make_pair(StringRef(), QualType())); // __context with shared vars;
// Start a captured region for 'target' with no implicit parameters.
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
- ParamsTarget, /*OpenMPCaptureLevel=*/1);
+ ParamsTarget,
+ /*OpenMPCaptureLevel=*/1);
Sema::CapturedParamNameType ParamsTeamsOrParallel[] = {
std::make_pair(".global_tid.", KmpInt32PtrTy),
std::make_pair(".bound_tid.", KmpInt32PtrTy),
@@ -4293,8 +4296,13 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) {
getCurCapturedRegion()->TheCapturedDecl->addAttr(
AlwaysInlineAttr::CreateImplicit(
Context, {}, AlwaysInlineAttr::Keyword_forceinline));
+ SmallVector<Sema::CapturedParamNameType, 2> ParamsTarget;
+ if (getLangOpts().OpenMPIsTargetDevice)
+ ParamsTarget.push_back(std::make_pair(StringRef("dyn_ptr"), VoidPtrTy));
+ ParamsTarget.push_back(
+ std::make_pair(StringRef(), QualType())); // __context with shared vars;
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
- std::make_pair(StringRef(), QualType()),
+ ParamsTarget,
/*OpenMPCaptureLevel=*/1);
break;
}
@@ -4499,9 +4507,11 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) {
getCurCapturedRegion()->TheCapturedDecl->addAttr(
AlwaysInlineAttr::CreateImplicit(
Context, {}, AlwaysInlineAttr::Keyword_forceinline));
- Sema::CapturedParamNameType ParamsTarget[] = {
- std::make_pair(StringRef(), QualType()) // __context with shared vars
- };
+ SmallVector<Sema::CapturedParamNameType, 2> ParamsTarget;
+ if (getLangOpts().OpenMPIsTargetDevice)
+ ParamsTarget.push_back(std::make_pair(StringRef("dyn_ptr"), VoidPtrTy));
+ ParamsTarget.push_back(
+ std::make_pair(StringRef(), QualType())); // __context with shared vars;
// Start a captured region for 'target' with no implicit parameters.
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
ParamsTarget, /*OpenMPCaptureLevel=*/1);
diff --git a/clang/test/OpenMP/amdgcn_target_codegen.cpp b/clang/test/OpenMP/amdgcn_target_codegen.cpp
index 90d2ebdf26bd645..3ea2d107f072adb 100644
--- a/clang/test/OpenMP/amdgcn_target_codegen.cpp
+++ b/clang/test/OpenMP/amdgcn_target_codegen.cpp
@@ -29,15 +29,18 @@ int test_amdgcn_target_tid_threads_simd() {
#endif
// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z30test_amdgcn_target_tid_threadsv_l14
-// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4000) [[ARR:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[ARR:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[ARR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr
// CHECK-NEXT: [[ARR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ARR_ADDR]] to ptr
// CHECK-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
+// CHECK-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8
// CHECK-NEXT: store ptr [[ARR]], ptr [[ARR_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARR_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z30test_amdgcn_target_tid_threadsv_l14_kernel_environment to ptr))
+// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z30test_amdgcn_target_tid_threadsv_l14_kernel_environment to ptr), ptr [[DYN_PTR]])
// CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
// CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK: user_code.entry:
@@ -66,19 +69,22 @@ int test_amdgcn_target_tid_threads_simd() {
//
//
// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z35test_amdgcn_target_tid_threads_simdv_l23
-// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4000) [[ARR:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[ARR:%.*]]) #[[ATTR1:[0-9]+]] {
// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[ARR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr
// CHECK-NEXT: [[ARR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ARR_ADDR]] to ptr
// CHECK-NEXT: [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
// CHECK-NEXT: [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr
// CHECK-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
+// CHECK-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8
// CHECK-NEXT: store ptr [[ARR]], ptr [[ARR_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARR_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z35test_amdgcn_target_tid_threads_simdv_l23_kernel_environment to ptr))
+// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z35test_amdgcn_target_tid_threads_simdv_l23_kernel_environment to ptr), ptr [[DYN_PTR]])
// CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
// CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK: user_code.entry:
diff --git a/clang/test/OpenMP/amdgcn_target_device_vla.cpp b/clang/test/OpenMP/amdgcn_target_device_vla.cpp
index b2b630b546713dd..de150a0fcb4afd2 100644
--- a/clang/test/OpenMP/amdgcn_target_device_vla.cpp
+++ b/clang/test/OpenMP/amdgcn_target_device_vla.cpp
@@ -97,21 +97,24 @@ int main() {
#endif
// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1v_l12
-// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[SUM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[N:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8, addrspace(5)
// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[I1:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr
// CHECK-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
// CHECK-NEXT: [[N_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N]] to ptr
// CHECK-NEXT: [[__VLA_EXPR0_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__VLA_EXPR0]] to ptr
// CHECK-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
// CHECK-NEXT: [[I1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I1]] to ptr
+// CHECK-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8
// CHECK-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1v_l12_kernel_environment to ptr))
+// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1v_l12_kernel_environment to ptr), ptr [[DYN_PTR]])
// CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
// CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK: user_code.entry:
@@ -174,26 +177,29 @@ int main() {
//
//
// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l30
-// CHECK-SAME: (i64 noundef [[M:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[RESULT:%.*]]) #[[ATTR0]] {
+// CHECK-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[M:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[RESULT:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[M_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
// CHECK-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
// CHECK-NEXT: [[RESULT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[M_CASTED:%.*]] = alloca i64, align 8, addrspace(5)
// CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr
// CHECK-NEXT: [[M_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[M_ADDR]] to ptr
// CHECK-NEXT: [[VLA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR]] to ptr
// CHECK-NEXT: [[RESULT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RESULT_ADDR]] to ptr
// CHECK-NEXT: [[M_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[M_CASTED]] to ptr
// CHECK-NEXT: [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr
// CHECK-NEXT: [[DOTTHREADID_TEMP__ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr
+// CHECK-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i64 [[M]], ptr [[M_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8
// CHECK-NEXT: store ptr [[RESULT]], ptr [[RESULT_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[RESULT_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l30_kernel_environment to ptr))
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l30_kernel_environment to ptr), ptr [[DYN_PTR]])
// CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP2]], -1
// CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK: user_code.entry:
@@ -540,26 +546,29 @@ int main() {
//
//
// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l52
-// CHECK-SAME: (i64 noundef [[M:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[RESULT:%.*]]) #[[ATTR0]] {
+// CHECK-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[M:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[RESULT:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[M_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
// CHECK-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
// CHECK-NEXT: [[RESULT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[M_CASTED:%.*]] = alloca i64, align 8, addrspace(5)
// CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr
// CHECK-NEXT: [[M_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[M_ADDR]] to ptr
// CHECK-NEXT: [[VLA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR]] to ptr
// CHECK-NEXT: [[RESULT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RESULT_ADDR]] to ptr
// CHECK-NEXT: [[M_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[M_CASTED]] to ptr
// CHECK-NEXT: [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr
// CHECK-NEXT: [[DOTTHREADID_TEMP__ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr
+// CHECK-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i64 [[M]], ptr [[M_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8
// CHECK-NEXT: store ptr [[RESULT]], ptr [[RESULT_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load...
[truncated]
|
5568f0a
to
4d864e5
Compare
4d864e5
to
04aafdc
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LG with some nits
openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
Outdated
Show resolved
Hide resolved
We used to perform team reduction on global memory allocated in the runtime and by clang. This was racy as multiple instances of a kernel, or different kernels with team reductions, would use the same locations. Since we now have the kernel launch environment, we can allocate dynamic memory per-launch, allowing us to move all the state into a non-racy place. Fixes: llvm#70249
04aafdc
to
1859bd4
Compare
Revert items 2 and 3 and work separately with 1 to get time to integrate them into ASO [OpenMP] Introduce the KernelLaunchEnvironment as implicit [OpenMP][FIX] Allocate per launch memory for GPU team reductions (llvm#70752) [OpenMP][FIX] Do not add implicit argument to device Ctors and Dtors Change-Id: I987405a1541ed3102ca78430496f611e565db9a0
…m#70752) We used to perform team reduction on global memory allocated in the runtime and by clang. This was racy as multiple instances of a kernel, or different kernels with team reductions, would use the same locations. Since we now have the kernel launch environment, we can allocate dynamic memory per-launch, allowing us to move all the state into a non-racy place. Fixes: llvm#70249 Change-Id: Id8a5932a1cde8cfcbb0e17655ef3f390f6f4d050
We used to perform team reduction on global memory allocated in the
runtime and by clang. This was racy as multiple instances of a kernel,
or different kernels with team reductions, would use the same locations.
Since we now have the kernel launch environment, we can allocate dynamic
memory per-launch, allowing us to move all the state into a non-racy
place.
Fixes: #70249