Skip to content

Commit 1d66649

Browse files
committed
[OpenMP] Codegen aggregate for outlined function captures
Parallel regions are outlined as functions with captured variables explicitly generated as distinct parameters in the function's argument list. That complicates the fork_call interface in the OpenMP runtime: (1) the fork_call is variadic since there is a variable number of arguments to forward to the outlined function, (2) wrapping/unwrapping arguments happens in the OpenMP runtime, which is sub-optimal, has been a source of ABI bugs, and has a hardcoded limit (16) on the number of arguments, (3) forwarded arguments must be cast to pointer types, which complicates debugging. This patch avoids those issues by aggregating captured arguments in a struct to pass to the fork_call. Reviewed By: jdoerfert, jhuber6. Differential Revision: https://reviews.llvm.org/D102107
1 parent 2af57b6 commit 1d66649

File tree

212 files changed

+360395
-307028
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

212 files changed

+360395
-307028
lines changed

clang/lib/CodeGen/CGOpenMPRuntime.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1284,7 +1284,7 @@ static llvm::Function *emitParallelOrTeamsOutlinedFunction(
12841284
CGOpenMPOutlinedRegionInfo CGInfo(*CS, ThreadIDVar, CodeGen, InnermostKind,
12851285
HasCancel, OutlinedHelperName);
12861286
CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo);
1287-
return CGF.GenerateOpenMPCapturedStmtFunction(*CS, D.getBeginLoc());
1287+
return CGF.GenerateOpenMPCapturedStmtFunctionAggregate(*CS, D.getBeginLoc());
12881288
}
12891289

12901290
llvm::Function *CGOpenMPRuntime::emitParallelOutlinedFunction(

clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp

Lines changed: 61 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1523,21 +1523,49 @@ void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
15231523
// TODO: Is that needed?
15241524
CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
15251525

1526+
// Store addresses of global arguments to pass to the parallel call.
15261527
Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
15271528
llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
15281529
"captured_vars_addrs");
1529-
// There's something to share.
1530+
1531+
// Store globalized values to push, pop through the global stack.
1532+
llvm::SmallDenseMap<llvm::Value *, unsigned> GlobalValuesToSizeMap;
15301533
if (!CapturedVars.empty()) {
1531-
// Prepare for parallel region. Indicate the outlined function.
15321534
ASTContext &Ctx = CGF.getContext();
15331535
unsigned Idx = 0;
15341536
for (llvm::Value *V : CapturedVars) {
15351537
Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
15361538
llvm::Value *PtrV;
15371539
if (V->getType()->isIntegerTy())
15381540
PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
1539-
else
1540-
PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
1541+
else {
1542+
assert(V->getType()->isPointerTy() &&
1543+
"Expected Pointer Type to globalize.");
1544+
// Globalize and store pointer.
1545+
llvm::Type *PtrElemTy = V->getType()->getPointerElementType();
1546+
auto &DL = CGM.getDataLayout();
1547+
unsigned GlobalSize = DL.getTypeAllocSize(PtrElemTy);
1548+
1549+
// Use shared memory to store globalized pointer values, for now this
1550+
// should be the outlined args aggregate struct.
1551+
llvm::Value *GlobalSizeArg[] = {
1552+
llvm::ConstantInt::get(CGM.SizeTy, GlobalSize)};
1553+
llvm::Value *GlobalValue = CGF.EmitRuntimeCall(
1554+
OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
1555+
OMPRTL___kmpc_alloc_shared),
1556+
GlobalSizeArg);
1557+
GlobalValuesToSizeMap[GlobalValue] = GlobalSize;
1558+
1559+
llvm::Value *CapturedVarVal = Bld.CreateAlignedLoad(
1560+
PtrElemTy, V, DL.getABITypeAlign(PtrElemTy));
1561+
llvm::Value *GlobalValueCast =
1562+
Bld.CreatePointerBitCastOrAddrSpaceCast(
1563+
GlobalValue, PtrElemTy->getPointerTo());
1564+
Bld.CreateDefaultAlignedStore(CapturedVarVal, GlobalValueCast);
1565+
1566+
PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(GlobalValue,
1567+
CGF.VoidPtrTy);
1568+
}
15411569
CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
15421570
Ctx.getPointerType(Ctx.VoidPtrTy));
15431571
++Idx;
@@ -1550,8 +1578,9 @@ void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
15501578
/* isSigned */ false);
15511579
else
15521580
IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);
1553-
15541581
assert(IfCondVal && "Expected a value");
1582+
1583+
// Create the parallel call.
15551584
llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
15561585
llvm::Value *Args[] = {
15571586
RTLoc,
@@ -1567,6 +1596,14 @@ void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
15671596
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
15681597
CGM.getModule(), OMPRTL___kmpc_parallel_51),
15691598
Args);
1599+
1600+
// Pop any globalized values from the global stack.
1601+
for (const auto &GV : GlobalValuesToSizeMap) {
1602+
CGF.EmitRuntimeCall(
1603+
OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
1604+
OMPRTL___kmpc_free_shared),
1605+
{GV.first, llvm::ConstantInt::get(CGM.SizeTy, GV.second)});
1606+
}
15701607
};
15711608

15721609
RegionCodeGenTy RCG(ParallelGen);
@@ -3477,7 +3514,6 @@ llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
34773514
D.getBeginLoc(), D.getBeginLoc());
34783515

34793516
const auto *RD = CS.getCapturedRecordDecl();
3480-
auto CurField = RD->field_begin();
34813517

34823518
Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
34833519
/*Name=*/".zero.addr");
@@ -3489,7 +3525,6 @@ llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
34893525
Args.emplace_back(ZeroAddr.getPointer());
34903526

34913527
CGBuilderTy &Bld = CGF.Builder;
3492-
auto CI = CS.capture_begin();
34933528

34943529
// Use global memory for data sharing.
34953530
// Handle passing of global args to workers.
@@ -3504,55 +3539,33 @@ llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
35043539
// Retrieve the shared variables from the list of references returned
35053540
// by the runtime. Pass the variables to the outlined function.
35063541
Address SharedArgListAddress = Address::invalid();
3507-
if (CS.capture_size() > 0 ||
3508-
isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3542+
if (CS.capture_size() > 0) {
35093543
SharedArgListAddress = CGF.EmitLoadOfPointer(
35103544
GlobalArgs, CGF.getContext()
35113545
.getPointerType(CGF.getContext().getPointerType(
35123546
CGF.getContext().VoidPtrTy))
35133547
.castAs<PointerType>());
3514-
}
3515-
unsigned Idx = 0;
3516-
if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3517-
Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
3548+
const auto *CI = CS.capture_begin();
3549+
// Load the outlined arg aggregate struct.
3550+
ASTContext &CGFContext = CGF.getContext();
3551+
QualType RecordPointerTy =
3552+
CGFContext.getPointerType(CGFContext.getRecordType(RD));
3553+
Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, /*Index=*/0);
35183554
Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3519-
Src, CGF.SizeTy->getPointerTo());
3520-
llvm::Value *LB = CGF.EmitLoadOfScalar(
3521-
TypedAddress,
3522-
/*Volatile=*/false,
3523-
CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
3524-
cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
3525-
Args.emplace_back(LB);
3526-
++Idx;
3527-
Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
3528-
TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3529-
Src, CGF.SizeTy->getPointerTo());
3530-
llvm::Value *UB = CGF.EmitLoadOfScalar(
3555+
Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(RecordPointerTy)));
3556+
llvm::Value *Arg = CGF.EmitLoadOfScalar(
35313557
TypedAddress,
3532-
/*Volatile=*/false,
3533-
CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
3534-
cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
3535-
Args.emplace_back(UB);
3536-
++Idx;
3537-
}
3538-
if (CS.capture_size() > 0) {
3558+
/*Volatile=*/false, CGFContext.getPointerType(RecordPointerTy),
3559+
CI->getLocation());
3560+
Args.emplace_back(Arg);
3561+
} else {
3562+
// If there are no captured arguments, use nullptr.
35393563
ASTContext &CGFContext = CGF.getContext();
3540-
for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
3541-
QualType ElemTy = CurField->getType();
3542-
Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
3543-
Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3544-
Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
3545-
llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
3546-
/*Volatile=*/false,
3547-
CGFContext.getPointerType(ElemTy),
3548-
CI->getLocation());
3549-
if (CI->capturesVariableByCopy() &&
3550-
!CI->getCapturedVar()->getType()->isAnyPointerType()) {
3551-
Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
3552-
CI->getLocation());
3553-
}
3554-
Args.emplace_back(Arg);
3555-
}
3564+
QualType RecordPointerTy =
3565+
CGFContext.getPointerType(CGFContext.getRecordType(RD));
3566+
llvm::Value *Arg =
3567+
llvm::Constant::getNullValue(CGF.ConvertTypeForMem(RecordPointerTy));
3568+
Args.emplace_back(Arg);
35563569
}
35573570

35583571
emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);

0 commit comments

Comments
 (0)