@@ -1523,21 +1523,49 @@ void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
1523
1523
// TODO: Is that needed?
1524
1524
CodeGenFunction::OMPPrivateScope PrivateArgScope (CGF);
1525
1525
1526
+ // Store addresses of global arguments to pass to the parallel call.
1526
1527
Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca (
1527
1528
llvm::ArrayType::get (CGM.VoidPtrTy , CapturedVars.size ()),
1528
1529
" captured_vars_addrs" );
1529
- // There's something to share.
1530
+
1531
+ // Map each globalized value to its size so it can be popped from the global stack.
1532
+ llvm::SmallDenseMap<llvm::Value *, unsigned > GlobalValuesToSizeMap;
1530
1533
if (!CapturedVars.empty ()) {
1531
- // Prepare for parallel region. Indicate the outlined function.
1532
1534
ASTContext &Ctx = CGF.getContext ();
1533
1535
unsigned Idx = 0 ;
1534
1536
for (llvm::Value *V : CapturedVars) {
1535
1537
Address Dst = Bld.CreateConstArrayGEP (CapturedVarsAddrs, Idx);
1536
1538
llvm::Value *PtrV;
1537
1539
if (V->getType ()->isIntegerTy ())
1538
1540
PtrV = Bld.CreateIntToPtr (V, CGF.VoidPtrTy );
1539
- else
1540
- PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast (V, CGF.VoidPtrTy );
1541
+ else {
1542
+ assert (V->getType ()->isPointerTy () &&
1543
+ " Expected Pointer Type to globalize." );
1544
+ // Globalize and store pointer.
1545
+ llvm::Type *PtrElemTy = V->getType ()->getPointerElementType ();
1546
+ auto &DL = CGM.getDataLayout ();
1547
+ unsigned GlobalSize = DL.getTypeAllocSize (PtrElemTy);
1548
+
1549
+ // Use shared memory to store globalized pointer values; for now this
1550
+ // should be the outlined args aggregate struct.
1551
+ llvm::Value *GlobalSizeArg[] = {
1552
+ llvm::ConstantInt::get (CGM.SizeTy , GlobalSize)};
1553
+ llvm::Value *GlobalValue = CGF.EmitRuntimeCall (
1554
+ OMPBuilder.getOrCreateRuntimeFunction (CGM.getModule (),
1555
+ OMPRTL___kmpc_alloc_shared),
1556
+ GlobalSizeArg);
1557
+ GlobalValuesToSizeMap[GlobalValue] = GlobalSize;
1558
+
1559
+ llvm::Value *CapturedVarVal = Bld.CreateAlignedLoad (
1560
+ PtrElemTy, V, DL.getABITypeAlign (PtrElemTy));
1561
+ llvm::Value *GlobalValueCast =
1562
+ Bld.CreatePointerBitCastOrAddrSpaceCast (
1563
+ GlobalValue, PtrElemTy->getPointerTo ());
1564
+ Bld.CreateDefaultAlignedStore (CapturedVarVal, GlobalValueCast);
1565
+
1566
+ PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast (GlobalValue,
1567
+ CGF.VoidPtrTy );
1568
+ }
1541
1569
CGF.EmitStoreOfScalar (PtrV, Dst, /* Volatile=*/ false ,
1542
1570
Ctx.getPointerType (Ctx.VoidPtrTy ));
1543
1571
++Idx;
@@ -1550,8 +1578,9 @@ void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
1550
1578
/* isSigned */ false );
1551
1579
else
1552
1580
IfCondVal = llvm::ConstantInt::get (CGF.Int32Ty , 1 );
1553
-
1554
1581
assert (IfCondVal && " Expected a value" );
1582
+
1583
+ // Create the parallel call.
1555
1584
llvm::Value *RTLoc = emitUpdateLocation (CGF, Loc);
1556
1585
llvm::Value *Args[] = {
1557
1586
RTLoc,
@@ -1567,6 +1596,14 @@ void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
1567
1596
CGF.EmitRuntimeCall (OMPBuilder.getOrCreateRuntimeFunction (
1568
1597
CGM.getModule (), OMPRTL___kmpc_parallel_51),
1569
1598
Args);
1599
+
1600
+ // Pop any globalized values from the global stack.
1601
+ for (const auto &GV : GlobalValuesToSizeMap) {
1602
+ CGF.EmitRuntimeCall (
1603
+ OMPBuilder.getOrCreateRuntimeFunction (CGM.getModule (),
1604
+ OMPRTL___kmpc_free_shared),
1605
+ {GV.first , llvm::ConstantInt::get (CGM.SizeTy , GV.second )});
1606
+ }
1570
1607
};
1571
1608
1572
1609
RegionCodeGenTy RCG (ParallelGen);
@@ -3477,7 +3514,6 @@ llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
3477
3514
D.getBeginLoc (), D.getBeginLoc ());
3478
3515
3479
3516
const auto *RD = CS.getCapturedRecordDecl ();
3480
- auto CurField = RD->field_begin ();
3481
3517
3482
3518
Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca (CGF.Int32Ty ,
3483
3519
/* Name=*/ " .zero.addr" );
@@ -3489,7 +3525,6 @@ llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
3489
3525
Args.emplace_back (ZeroAddr.getPointer ());
3490
3526
3491
3527
CGBuilderTy &Bld = CGF.Builder ;
3492
- auto CI = CS.capture_begin ();
3493
3528
3494
3529
// Use global memory for data sharing.
3495
3530
// Handle passing of global args to workers.
@@ -3504,55 +3539,33 @@ llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
3504
3539
// Retrieve the shared variables from the list of references returned
3505
3540
// by the runtime. Pass the variables to the outlined function.
3506
3541
Address SharedArgListAddress = Address::invalid ();
3507
- if (CS.capture_size () > 0 ||
3508
- isOpenMPLoopBoundSharingDirective (D.getDirectiveKind ())) {
3542
+ if (CS.capture_size () > 0 ) {
3509
3543
SharedArgListAddress = CGF.EmitLoadOfPointer (
3510
3544
GlobalArgs, CGF.getContext ()
3511
3545
.getPointerType (CGF.getContext ().getPointerType (
3512
3546
CGF.getContext ().VoidPtrTy ))
3513
3547
.castAs <PointerType>());
3514
- }
3515
- unsigned Idx = 0 ;
3516
- if (isOpenMPLoopBoundSharingDirective (D.getDirectiveKind ())) {
3517
- Address Src = Bld.CreateConstInBoundsGEP (SharedArgListAddress, Idx);
3548
+ const auto *CI = CS.capture_begin ();
3549
+ // Load the outlined arg aggregate struct.
3550
+ ASTContext &CGFContext = CGF.getContext ();
3551
+ QualType RecordPointerTy =
3552
+ CGFContext.getPointerType (CGFContext.getRecordType (RD));
3553
+ Address Src = Bld.CreateConstInBoundsGEP (SharedArgListAddress, /* Index=*/ 0 );
3518
3554
Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast (
3519
- Src, CGF.SizeTy ->getPointerTo ());
3520
- llvm::Value *LB = CGF.EmitLoadOfScalar (
3521
- TypedAddress,
3522
- /* Volatile=*/ false ,
3523
- CGF.getContext ().getPointerType (CGF.getContext ().getSizeType ()),
3524
- cast<OMPLoopDirective>(D).getLowerBoundVariable ()->getExprLoc ());
3525
- Args.emplace_back (LB);
3526
- ++Idx;
3527
- Src = Bld.CreateConstInBoundsGEP (SharedArgListAddress, Idx);
3528
- TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast (
3529
- Src, CGF.SizeTy ->getPointerTo ());
3530
- llvm::Value *UB = CGF.EmitLoadOfScalar (
3555
+ Src, CGF.ConvertTypeForMem (CGFContext.getPointerType (RecordPointerTy)));
3556
+ llvm::Value *Arg = CGF.EmitLoadOfScalar (
3531
3557
TypedAddress,
3532
- /* Volatile=*/ false ,
3533
- CGF.getContext ().getPointerType (CGF.getContext ().getSizeType ()),
3534
- cast<OMPLoopDirective>(D).getUpperBoundVariable ()->getExprLoc ());
3535
- Args.emplace_back (UB);
3536
- ++Idx;
3537
- }
3538
- if (CS.capture_size () > 0 ) {
3558
+ /* Volatile=*/ false , CGFContext.getPointerType (RecordPointerTy),
3559
+ CI->getLocation ());
3560
+ Args.emplace_back (Arg);
3561
+ } else {
3562
+ // If there are no captured arguments, use nullptr.
3539
3563
ASTContext &CGFContext = CGF.getContext ();
3540
- for (unsigned I = 0 , E = CS.capture_size (); I < E; ++I, ++CI, ++CurField) {
3541
- QualType ElemTy = CurField->getType ();
3542
- Address Src = Bld.CreateConstInBoundsGEP (SharedArgListAddress, I + Idx);
3543
- Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast (
3544
- Src, CGF.ConvertTypeForMem (CGFContext.getPointerType (ElemTy)));
3545
- llvm::Value *Arg = CGF.EmitLoadOfScalar (TypedAddress,
3546
- /* Volatile=*/ false ,
3547
- CGFContext.getPointerType (ElemTy),
3548
- CI->getLocation ());
3549
- if (CI->capturesVariableByCopy () &&
3550
- !CI->getCapturedVar ()->getType ()->isAnyPointerType ()) {
3551
- Arg = castValueToType (CGF, Arg, ElemTy, CGFContext.getUIntPtrType (),
3552
- CI->getLocation ());
3553
- }
3554
- Args.emplace_back (Arg);
3555
- }
3564
+ QualType RecordPointerTy =
3565
+ CGFContext.getPointerType (CGFContext.getRecordType (RD));
3566
+ llvm::Value *Arg =
3567
+ llvm::Constant::getNullValue (CGF.ConvertTypeForMem (RecordPointerTy));
3568
+ Args.emplace_back (Arg);
3556
3569
}
3557
3570
3558
3571
emitOutlinedFunctionCall (CGF, D.getBeginLoc (), OutlinedParallelFn, Args);
0 commit comments