@@ -921,10 +921,12 @@ void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
921
921
llvm::Type *LLVMReductionsBufferTy =
922
922
CGM.getTypes ().ConvertTypeForMem (StaticTy);
923
923
const auto &DL = CGM.getModule ().getDataLayout ();
924
- uint64_t BufferSize =
924
+ uint64_t ReductionDataSize =
925
925
DL.getTypeAllocSize (LLVMReductionsBufferTy).getFixedValue ();
926
926
CGBuilderTy &Bld = CGF.Builder ;
927
- OMPBuilder.createTargetDeinit (Bld, BufferSize);
927
+ OMPBuilder.createTargetDeinit (Bld, ReductionDataSize,
928
+ C.getLangOpts ().OpenMPCUDAReductionBufNum );
929
+ TeamsReductions.clear ();
928
930
}
929
931
930
932
void CGOpenMPRuntimeGPU::emitSPMDKernel (const OMPExecutableDirective &D,
@@ -3023,15 +3025,25 @@ void CGOpenMPRuntimeGPU::emitReduction(
3023
3025
assert ((TeamsReduction || ParallelReduction) &&
3024
3026
" Invalid reduction selection in emitReduction." );
3025
3027
3028
+ llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
3029
+ llvm::SmallVector<const ValueDecl *, 4 > PrivatesReductions (Privates.size ());
3030
+ int Cnt = 0 ;
3031
+ for (const Expr *DRE : Privates) {
3032
+ PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl ();
3033
+ ++Cnt;
3034
+ }
3035
+
3036
+ ASTContext &C = CGM.getContext ();
3037
+ const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars (
3038
+ CGM.getContext (), PrivatesReductions, std::nullopt, VarFieldMap, 1 );
3039
+
3026
3040
// Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3027
3041
// RedList, shuffle_reduce_func, interwarp_copy_func);
3028
3042
// or
3029
3043
// Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
3030
3044
llvm::Value *RTLoc = emitUpdateLocation (CGF, Loc);
3031
- llvm::Value *ThreadId = getThreadID (CGF, Loc);
3032
3045
3033
3046
llvm::Value *Res;
3034
- ASTContext &C = CGM.getContext ();
3035
3047
// 1. Build a list of reduction variables.
3036
3048
// void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3037
3049
auto Size = RHSExprs.size ();
@@ -3073,19 +3085,17 @@ void CGOpenMPRuntimeGPU::emitReduction(
3073
3085
llvm::Function *ReductionFn = emitReductionFunction (
3074
3086
CGF.CurFn ->getName (), Loc, CGF.ConvertTypeForMem (ReductionArrayTy),
3075
3087
Privates, LHSExprs, RHSExprs, ReductionOps);
3076
- llvm::Value *ReductionArrayTySize = CGF.getTypeSize (ReductionArrayTy);
3088
+ llvm::Value *ReductionDataSize =
3089
+ CGF.getTypeSize (C.getRecordType (ReductionRec));
3090
+ ReductionDataSize =
3091
+ CGF.Builder .CreateSExtOrTrunc (ReductionDataSize, CGF.Int64Ty );
3077
3092
llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction (
3078
3093
CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
3079
3094
llvm::Value *InterWarpCopyFn =
3080
3095
emitInterWarpCopyFunction (CGM, Privates, ReductionArrayTy, Loc);
3081
3096
3082
3097
if (ParallelReduction) {
3083
- llvm::Value *Args[] = {RTLoc,
3084
- ThreadId,
3085
- CGF.Builder .getInt32 (RHSExprs.size ()),
3086
- ReductionArrayTySize,
3087
- RL,
3088
- ShuffleAndReduceFn,
3098
+ llvm::Value *Args[] = {RTLoc, ReductionDataSize, RL, ShuffleAndReduceFn,
3089
3099
InterWarpCopyFn};
3090
3100
3091
3101
Res = CGF.EmitRuntimeCall (
@@ -3094,37 +3104,27 @@ void CGOpenMPRuntimeGPU::emitReduction(
3094
3104
Args);
3095
3105
} else {
3096
3106
assert (TeamsReduction && " expected teams reduction." );
3097
- llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
3098
- llvm::SmallVector<const ValueDecl *, 4 > PrivatesReductions (Privates.size ());
3099
- int Cnt = 0 ;
3100
- for (const Expr *DRE : Privates) {
3101
- PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl ();
3102
- ++Cnt;
3103
- }
3104
- const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars (
3105
- CGM.getContext (), PrivatesReductions, std::nullopt, VarFieldMap,
3106
- C.getLangOpts ().OpenMPCUDAReductionBufNum );
3107
- TeamsReductions.push_back (TeamReductionRec);
3107
+ TeamsReductions.push_back (ReductionRec);
3108
3108
auto *KernelTeamsReductionPtr = CGF.EmitRuntimeCall (
3109
3109
OMPBuilder.getOrCreateRuntimeFunction (
3110
3110
CGM.getModule (), OMPRTL___kmpc_reduction_get_fixed_buffer),
3111
3111
{}, " _openmp_teams_reductions_buffer_$_$ptr" );
3112
3112
llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction (
3113
- CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec , VarFieldMap);
3113
+ CGM, Privates, ReductionArrayTy, Loc, ReductionRec , VarFieldMap);
3114
3114
llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction (
3115
- CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec , VarFieldMap,
3115
+ CGM, Privates, ReductionArrayTy, Loc, ReductionRec , VarFieldMap,
3116
3116
ReductionFn);
3117
3117
llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction (
3118
- CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec , VarFieldMap);
3118
+ CGM, Privates, ReductionArrayTy, Loc, ReductionRec , VarFieldMap);
3119
3119
llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction (
3120
- CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec , VarFieldMap,
3120
+ CGM, Privates, ReductionArrayTy, Loc, ReductionRec , VarFieldMap,
3121
3121
ReductionFn);
3122
3122
3123
3123
llvm::Value *Args[] = {
3124
3124
RTLoc,
3125
- ThreadId,
3126
3125
KernelTeamsReductionPtr,
3127
3126
CGF.Builder .getInt32 (C.getLangOpts ().OpenMPCUDAReductionBufNum ),
3127
+ ReductionDataSize,
3128
3128
RL,
3129
3129
ShuffleAndReduceFn,
3130
3130
InterWarpCopyFn,
0 commit comments