@@ -801,10 +801,12 @@ void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
801
801
llvm::Type *LLVMReductionsBufferTy =
802
802
CGM.getTypes ().ConvertTypeForMem (StaticTy);
803
803
const auto &DL = CGM.getModule ().getDataLayout ();
804
- uint64_t BufferSize =
804
+ uint64_t ReductionDataSize =
805
805
DL.getTypeAllocSize (LLVMReductionsBufferTy).getFixedValue ();
806
806
CGBuilderTy &Bld = CGF.Builder ;
807
- OMPBuilder.createTargetDeinit (Bld, BufferSize);
807
+ OMPBuilder.createTargetDeinit (Bld, ReductionDataSize,
808
+ C.getLangOpts ().OpenMPCUDAReductionBufNum );
809
+ TeamsReductions.clear ();
808
810
}
809
811
810
812
void CGOpenMPRuntimeGPU::emitSPMDKernel (const OMPExecutableDirective &D,
@@ -2828,15 +2830,25 @@ void CGOpenMPRuntimeGPU::emitReduction(
2828
2830
assert ((TeamsReduction || ParallelReduction) &&
2829
2831
" Invalid reduction selection in emitReduction." );
2830
2832
2833
+ llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
2834
+ llvm::SmallVector<const ValueDecl *, 4 > PrivatesReductions (Privates.size ());
2835
+ int Cnt = 0 ;
2836
+ for (const Expr *DRE : Privates) {
2837
+ PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl ();
2838
+ ++Cnt;
2839
+ }
2840
+
2841
+ ASTContext &C = CGM.getContext ();
2842
+ const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars (
2843
+ CGM.getContext (), PrivatesReductions, std::nullopt, VarFieldMap, 1 );
2844
+
2831
2845
// Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
2832
2846
// RedList, shuffle_reduce_func, interwarp_copy_func);
2833
2847
// or
2834
2848
// Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
2835
2849
llvm::Value *RTLoc = emitUpdateLocation (CGF, Loc);
2836
- llvm::Value *ThreadId = getThreadID (CGF, Loc);
2837
2850
2838
2851
llvm::Value *Res;
2839
- ASTContext &C = CGM.getContext ();
2840
2852
// 1. Build a list of reduction variables.
2841
2853
// void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2842
2854
auto Size = RHSExprs.size ();
@@ -2878,19 +2890,17 @@ void CGOpenMPRuntimeGPU::emitReduction(
2878
2890
llvm::Function *ReductionFn = emitReductionFunction (
2879
2891
CGF.CurFn ->getName (), Loc, CGF.ConvertTypeForMem (ReductionArrayTy),
2880
2892
Privates, LHSExprs, RHSExprs, ReductionOps);
2881
- llvm::Value *ReductionArrayTySize = CGF.getTypeSize (ReductionArrayTy);
2893
+ llvm::Value *ReductionDataSize =
2894
+ CGF.getTypeSize (C.getRecordType (ReductionRec));
2895
+ ReductionDataSize =
2896
+ CGF.Builder .CreateSExtOrTrunc (ReductionDataSize, CGF.Int64Ty );
2882
2897
llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction (
2883
2898
CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
2884
2899
llvm::Value *InterWarpCopyFn =
2885
2900
emitInterWarpCopyFunction (CGM, Privates, ReductionArrayTy, Loc);
2886
2901
2887
2902
if (ParallelReduction) {
2888
- llvm::Value *Args[] = {RTLoc,
2889
- ThreadId,
2890
- CGF.Builder .getInt32 (RHSExprs.size ()),
2891
- ReductionArrayTySize,
2892
- RL,
2893
- ShuffleAndReduceFn,
2903
+ llvm::Value *Args[] = {RTLoc, ReductionDataSize, RL, ShuffleAndReduceFn,
2894
2904
InterWarpCopyFn};
2895
2905
2896
2906
Res = CGF.EmitRuntimeCall (
@@ -2899,37 +2909,27 @@ void CGOpenMPRuntimeGPU::emitReduction(
2899
2909
Args);
2900
2910
} else {
2901
2911
assert (TeamsReduction && " expected teams reduction." );
2902
- llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
2903
- llvm::SmallVector<const ValueDecl *, 4 > PrivatesReductions (Privates.size ());
2904
- int Cnt = 0 ;
2905
- for (const Expr *DRE : Privates) {
2906
- PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl ();
2907
- ++Cnt;
2908
- }
2909
- const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars (
2910
- CGM.getContext (), PrivatesReductions, std::nullopt, VarFieldMap,
2911
- C.getLangOpts ().OpenMPCUDAReductionBufNum );
2912
- TeamsReductions.push_back (TeamReductionRec);
2912
+ TeamsReductions.push_back (ReductionRec);
2913
2913
auto *KernelTeamsReductionPtr = CGF.EmitRuntimeCall (
2914
2914
OMPBuilder.getOrCreateRuntimeFunction (
2915
2915
CGM.getModule (), OMPRTL___kmpc_reduction_get_fixed_buffer),
2916
2916
{}, " _openmp_teams_reductions_buffer_$_$ptr" );
2917
2917
llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction (
2918
- CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec , VarFieldMap);
2918
+ CGM, Privates, ReductionArrayTy, Loc, ReductionRec , VarFieldMap);
2919
2919
llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction (
2920
- CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec , VarFieldMap,
2920
+ CGM, Privates, ReductionArrayTy, Loc, ReductionRec , VarFieldMap,
2921
2921
ReductionFn);
2922
2922
llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction (
2923
- CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec , VarFieldMap);
2923
+ CGM, Privates, ReductionArrayTy, Loc, ReductionRec , VarFieldMap);
2924
2924
llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction (
2925
- CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec , VarFieldMap,
2925
+ CGM, Privates, ReductionArrayTy, Loc, ReductionRec , VarFieldMap,
2926
2926
ReductionFn);
2927
2927
2928
2928
llvm::Value *Args[] = {
2929
2929
RTLoc,
2930
- ThreadId,
2931
2930
KernelTeamsReductionPtr,
2932
2931
CGF.Builder .getInt32 (C.getLangOpts ().OpenMPCUDAReductionBufNum ),
2932
+ ReductionDataSize,
2933
2933
RL,
2934
2934
ShuffleAndReduceFn,
2935
2935
InterWarpCopyFn,
0 commit comments