
Commit 92b0752

[MemProf] Support cloning through recursive cycles (#127429)
In order to facilitate cloning of recursive cycles, we first identify backedges using a standard DFS search from the root callers, then initially defer recursively invoking the cloning function via those edges. This is because the cloning opportunity along the backedge may not be exposed until the current node is cloned for other non-backedge callers that are cold after the earlier recursive cloning, resulting in a cold predecessor of the backedge. So we recursively invoke the cloning function for the backedges during the cloning of the current node for its caller edges (which were sorted to enable handling cold callers first). There was no significant time or memory overhead measured for several large applications.
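For intuition, the backedge identification step can be illustrated on a toy caller-to-callee graph. The sketch below is a minimal, self-contained illustration of the DFS idea described above, assuming simplified stand-in Node/Edge types and a hypothetical three-node graph; it is not the pass's actual ContextNode/ContextEdge machinery, which additionally defers and later performs cloning across the marked edges (see the diff below).

// Minimal sketch of the DFS backedge marking described above. The Node/Edge
// types and the tiny graph are hypothetical stand-ins, not the pass's real
// ContextNode/ContextEdge.
#include <cassert>
#include <memory>
#include <set>
#include <vector>

struct Node;

struct Edge {
  Node *Callee = nullptr;
  bool IsBackedge = false;
};

struct Node {
  std::vector<std::shared_ptr<Edge>> CalleeEdges;
};

// Standard DFS: an edge whose callee is already on the current DFS stack
// closes a cycle, so it is flagged as a backedge and not recursed through.
void markBackedges(Node *N, std::set<const Node *> &Visited,
                   std::set<const Node *> &CurrentStack) {
  auto Inserted = Visited.insert(N);
  assert(Inserted.second && "expected an unvisited node");
  (void)Inserted;
  for (auto &E : N->CalleeEdges) {
    Node *Callee = E->Callee;
    if (Visited.count(Callee)) {
      if (CurrentStack.count(Callee))
        E->IsBackedge = true;
      continue;
    }
    CurrentStack.insert(Callee);
    markBackedges(Callee, Visited, CurrentStack);
    CurrentStack.erase(Callee);
  }
}

int main() {
  // Root -> A -> B -> A: the only edge that closes a cycle is B -> A.
  Node Root, A, B;
  auto link = [](Node &From, Node &To) {
    auto E = std::make_shared<Edge>();
    E->Callee = &To;
    From.CalleeEdges.push_back(std::move(E));
  };
  link(Root, A);
  link(A, B);
  link(B, A);

  // Root has no callers, so (as in the pass) the DFS starts there with an
  // empty stack; the root itself never needs to be on the stack because no
  // edge can target a node without callers.
  std::set<const Node *> Visited, CurrentStack;
  markBackedges(&Root, Visited, CurrentStack);
  assert(CurrentStack.empty());

  assert(!Root.CalleeEdges[0]->IsBackedge); // Root -> A
  assert(!A.CalleeEdges[0]->IsBackedge);    // A -> B
  assert(B.CalleeEdges[0]->IsBackedge);     // B -> A closes the cycle
  return 0;
}

In the actual change, cloning along edges flagged this way is initially skipped and is instead re-invoked while cloning the callee node for its (cold-first sorted) caller edges, as the commit message explains.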
1 parent 1a6ed4d commit 92b0752

3 files changed: +232, -45 lines changed


llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp

Lines changed: 203 additions & 30 deletions
@@ -89,6 +89,7 @@ STATISTIC(FoundProfiledCalleeMaxDepth,
           "Maximum depth of profiled callees found via tail calls");
 STATISTIC(FoundProfiledCalleeNonUniquelyCount,
           "Number of profiled callees found via multiple tail call chains");
+STATISTIC(DeferredBackedges, "Number of backedges with deferred cloning");
 
 static cl::opt<std::string> DotFilePathPrefix(
     "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
@@ -127,14 +128,18 @@ static cl::opt<bool> AllowRecursiveCallsites(
     "memprof-allow-recursive-callsites", cl::init(true), cl::Hidden,
     cl::desc("Allow cloning of callsites involved in recursive cycles"));
 
+static cl::opt<bool> CloneRecursiveContexts(
+    "memprof-clone-recursive-contexts", cl::init(true), cl::Hidden,
+    cl::desc("Allow cloning of contexts through recursive cycles"));
+
 // When disabled, try to detect and prevent cloning of recursive contexts.
 // This is only necessary until we support cloning through recursive cycles.
 // Leave on by default for now, as disabling requires a little bit of compile
 // time overhead and doesn't affect correctness, it will just inflate the cold
 // hinted bytes reporting a bit when -memprof-report-hinted-sizes is enabled.
 static cl::opt<bool> AllowRecursiveContexts(
     "memprof-allow-recursive-contexts", cl::init(true), cl::Hidden,
-    cl::desc("Allow cloning of contexts through recursive cycles"));
+    cl::desc("Allow cloning of contexts having recursive cycles"));
 
 namespace llvm {
 cl::opt<bool> EnableMemProfContextDisambiguation(
@@ -293,37 +298,40 @@ class CallsiteContextGraph {
     // TODO: Should this be a map (from Caller node) for more efficient lookup?
     std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
 
-    // Get the list of edges from which we can compute allocation information
-    // such as the context ids and allocation type of this node.
-    const std::vector<std::shared_ptr<ContextEdge>> *
-    getEdgesWithAllocInfo() const {
-      // If node has any callees, compute from those, otherwise compute from
-      // callers (i.e. if this is the leaf allocation node).
-      if (!CalleeEdges.empty())
-        return &CalleeEdges;
+    // Returns true if we need to look at the callee edges for determining the
+    // node context ids and allocation type.
+    bool useCallerEdgesForContextInfo() const {
      // Typically if the callee edges are empty either the caller edges are
      // also empty, or this is an allocation (leaf node). However, if we are
      // allowing recursive callsites and contexts this will be violated for
      // incompletely cloned recursive cycles.
-      assert(CallerEdges.empty() || IsAllocation ||
+      assert(!CalleeEdges.empty() || CallerEdges.empty() || IsAllocation ||
             (AllowRecursiveCallsites && AllowRecursiveContexts));
-      if (!CallerEdges.empty() && IsAllocation)
-        return &CallerEdges;
-      return nullptr;
+      // When cloning for a recursive context, during cloning we might be in the
+      // midst of cloning for a recurrence and have moved context ids off of a
+      // caller edge onto the clone but not yet off of the incoming caller
+      // (back) edge. If we don't look at those we miss the fact that this node
+      // still has context ids of interest.
+      return IsAllocation || CloneRecursiveContexts;
     }
 
     // Compute the context ids for this node from the union of its edge context
     // ids.
     DenseSet<uint32_t> getContextIds() const {
-      DenseSet<uint32_t> ContextIds;
-      auto *Edges = getEdgesWithAllocInfo();
-      if (!Edges)
-        return {};
       unsigned Count = 0;
-      for (auto &Edge : *Edges)
+      // Compute the number of ids for reserve below. In general we only need to
+      // look at one set of edges, typically the callee edges, since other than
+      // allocations and in some cases during recursion cloning, all the context
+      // ids on the callers should also flow out via callee edges.
+      for (auto &Edge : CalleeEdges.empty() ? CallerEdges : CalleeEdges)
         Count += Edge->getContextIds().size();
+      DenseSet<uint32_t> ContextIds;
       ContextIds.reserve(Count);
-      for (auto &Edge : *Edges)
+      auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
+          CalleeEdges, useCallerEdgesForContextInfo()
+                           ? CallerEdges
+                           : std::vector<std::shared_ptr<ContextEdge>>());
+      for (const auto &Edge : Edges)
         ContextIds.insert(Edge->getContextIds().begin(),
                           Edge->getContextIds().end());
       return ContextIds;
@@ -332,13 +340,14 @@ class CallsiteContextGraph {
     // Compute the allocation type for this node from the OR of its edge
     // allocation types.
     uint8_t computeAllocType() const {
-      auto *Edges = getEdgesWithAllocInfo();
-      if (!Edges)
-        return (uint8_t)AllocationType::None;
       uint8_t BothTypes =
           (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
       uint8_t AllocType = (uint8_t)AllocationType::None;
-      for (auto &Edge : *Edges) {
+      auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
+          CalleeEdges, useCallerEdgesForContextInfo()
+                           ? CallerEdges
+                           : std::vector<std::shared_ptr<ContextEdge>>());
+      for (const auto &Edge : Edges) {
         AllocType |= Edge->AllocTypes;
         // Bail early if alloc type reached both, no further refinement.
         if (AllocType == BothTypes)
@@ -350,10 +359,11 @@ class CallsiteContextGraph {
     // The context ids set for this node is empty if its edge context ids are
     // also all empty.
     bool emptyContextIds() const {
-      auto *Edges = getEdgesWithAllocInfo();
-      if (!Edges)
-        return true;
-      for (auto &Edge : *Edges) {
+      auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
+          CalleeEdges, useCallerEdgesForContextInfo()
+                           ? CallerEdges
+                           : std::vector<std::shared_ptr<ContextEdge>>());
+      for (const auto &Edge : Edges) {
         if (!Edge->getContextIds().empty())
           return false;
       }
@@ -434,6 +444,14 @@ class CallsiteContextGraph {
     // for contexts including this edge.
     uint8_t AllocTypes = 0;
 
+    // Set just before initiating cloning when cloning of recursive contexts is
+    // enabled. Used to defer cloning of backedges until we have done cloning of
+    // the callee node for non-backedge caller edges. This exposes cloning
+    // opportunities through the backedge of the cycle.
+    // TODO: Note that this is not updated during cloning, and it is unclear
+    // whether that would be needed.
+    bool IsBackedge = false;
+
     // The set of IDs for contexts including this edge.
     DenseSet<uint32_t> ContextIds;
 
@@ -722,6 +740,9 @@ class CallsiteContextGraph {
   void moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
                                  ContextNode *NewCaller);
 
+  void markBackedges(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
+                     DenseSet<const ContextNode *> &CurrentStack);
+
   /// Recursively perform cloning on the graph for the given Node and its
   /// callers, in order to uniquely identify the allocation behavior of an
   /// allocation given its context. The context ids of the allocation being
@@ -2874,6 +2895,7 @@ template <typename DerivedCCG, typename FuncTy, typename CallTy>
 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
     raw_ostream &OS) const {
   OS << "Edge from Callee " << Callee << " to Caller: " << Caller
+     << (IsBackedge ? " (BE)" : "")
      << " AllocTypes: " << getAllocTypeString(AllocTypes);
   OS << " ContextIds:";
   std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
@@ -3115,6 +3137,8 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
   // node (Edge's current callee may be the original node too).
   assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
 
+  bool EdgeIsRecursive = Edge->Callee == Edge->Caller;
+
   ContextNode *OldCallee = Edge->Callee;
 
   // We might already have an edge to the new callee from earlier cloning for a
@@ -3181,8 +3205,16 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
     // If this is a direct recursion edge, use NewCallee (the clone) as the
     // callee as well, so that any edge updated/created here is also direct
     // recursive.
-    if (CalleeToUse == OldCallee)
+    if (CalleeToUse == OldCallee) {
+      // If this is a recursive edge, see if we already moved a recursive edge
+      // (which would have to have been this one) - if we were only moving a
+      // subset of context ids it would still be on OldCallee.
+      if (EdgeIsRecursive) {
+        assert(OldCalleeEdge == Edge);
+        continue;
+      }
       CalleeToUse = NewCallee;
+    }
     // The context ids moving to the new callee are the subset of this edge's
     // context ids and the context ids on the caller edge being moved.
     DenseSet<uint32_t> EdgeContextIdsToMove =
@@ -3369,9 +3401,47 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
   }
 }
 
+// This is the standard DFS based backedge discovery algorithm.
+template <typename DerivedCCG, typename FuncTy, typename CallTy>
+void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges(
+    ContextNode *Node, DenseSet<const ContextNode *> &Visited,
+    DenseSet<const ContextNode *> &CurrentStack) {
+  auto I = Visited.insert(Node);
+  // We should only call this for unvisited nodes.
+  assert(I.second);
+  for (auto &CalleeEdge : Node->CalleeEdges) {
+    auto *Callee = CalleeEdge->Callee;
+    if (Visited.count(Callee)) {
+      // Since this was already visited we need to check if it is currently on
+      // the recursive stack in which case it is a backedge.
+      if (CurrentStack.count(Callee))
+        CalleeEdge->IsBackedge = true;
+      continue;
+    }
+    CurrentStack.insert(Callee);
+    markBackedges(Callee, Visited, CurrentStack);
+    CurrentStack.erase(Callee);
+  }
+}
+
 template <typename DerivedCCG, typename FuncTy, typename CallTy>
 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
+  // If we are cloning recursive contexts, find and mark backedges from all root
+  // callers, using the typical DFS based backedge analysis.
   DenseSet<const ContextNode *> Visited;
+  if (CloneRecursiveContexts) {
+    DenseSet<const ContextNode *> CurrentStack;
+    for (auto &Entry : NonAllocationCallToContextNodeMap) {
+      auto *Node = Entry.second;
+      if (Node->isRemoved())
+        continue;
+      // It is a root if it doesn't have callers.
+      if (!Node->CallerEdges.empty())
+        continue;
+      markBackedges(Node, Visited, CurrentStack);
+      assert(CurrentStack.empty());
+    }
+  }
   for (auto &Entry : AllocationCallToContextNodeMap) {
     Visited.clear();
     identifyClones(Entry.second, Visited, Entry.second->getContextIds());
@@ -3430,6 +3500,14 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
       assert(!is_contained(Node->CallerEdges, Edge));
       continue;
     }
+    // Defer backedges. See comments further below where these edges are
+    // handled during the cloning of this Node.
+    if (Edge->IsBackedge) {
+      // We should only mark these if cloning recursive contexts, where we
+      // need to do this deferral.
+      assert(CloneRecursiveContexts);
+      continue;
+    }
     // Ignore any caller we previously visited via another edge.
     if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
       identifyClones(Edge->Caller, Visited, AllocContextIds);
@@ -3483,6 +3561,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
   assert(Node->AllocTypes != (uint8_t)AllocationType::None);
 
   DenseSet<uint32_t> RecursiveContextIds;
+  assert(AllowRecursiveContexts || !CloneRecursiveContexts);
   // If we are allowing recursive callsites, but have also disabled recursive
   // contexts, look for context ids that show up in multiple caller edges.
   if (AllowRecursiveCallsites && !AllowRecursiveContexts) {
@@ -3505,6 +3584,13 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
   // makes it less error-prone.
   auto CallerEdges = Node->CallerEdges;
   for (auto &CallerEdge : CallerEdges) {
+    // Skip any that have been removed by an earlier recursive call.
+    if (CallerEdge->isRemoved()) {
+      assert(!is_contained(Node->CallerEdges, CallerEdge));
+      continue;
+    }
+    assert(CallerEdge->Callee == Node);
+
     // See if cloning the prior caller edge left this node with a single alloc
     // type or a single caller. In that case no more cloning of Node is needed.
     if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
@@ -3546,13 +3632,100 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
     //
     // Then check if by cloning node at least one of the callee edges will be
     // disambiguated by splitting out different context ids.
+    //
+    // However, always do the cloning if this is a backedge, in which case we
+    // have not yet cloned along this caller edge.
     assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
     assert(Node->AllocTypes != (uint8_t)AllocationType::None);
-    if (allocTypeToUse(CallerAllocTypeForAlloc) ==
+    if (!CallerEdge->IsBackedge &&
+        allocTypeToUse(CallerAllocTypeForAlloc) ==
             allocTypeToUse(Node->AllocTypes) &&
         allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
-            CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges))
+            CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
       continue;
+    }
+
+    if (CallerEdge->IsBackedge) {
+      // We should only mark these if cloning recursive contexts, where we
+      // need to do this deferral.
+      assert(CloneRecursiveContexts);
+      DeferredBackedges++;
+    }
+
+    // If this is a backedge, we now do recursive cloning starting from its
+    // caller since we may have moved unambiguous caller contexts to a clone
+    // of this Node in a previous iteration of the current loop, giving more
+    // opportunity for cloning through the backedge. Because we sorted the
+    // caller edges earlier so that cold caller edges are first, we would have
+    // visited and cloned this node for any unamibiguously cold non-recursive
+    // callers before any ambiguous backedge callers. Note that we don't do this
+    // if the caller is already cloned or visited during cloning (e.g. via a
+    // different context path from the allocation).
+    // TODO: Can we do better in the case where the caller was already visited?
+    if (CallerEdge->IsBackedge && !CallerEdge->Caller->CloneOf &&
+        !Visited.count(CallerEdge->Caller)) {
+      const auto OrigIdCount = CallerEdge->getContextIds().size();
+      // Now do the recursive cloning of this backedge's caller, which was
+      // deferred earlier.
+      identifyClones(CallerEdge->Caller, Visited, CallerEdgeContextsForAlloc);
+      removeNoneTypeCalleeEdges(CallerEdge->Caller);
+      // See if the recursive call to identifyClones moved the context ids to a
+      // new edge from this node to a clone of caller, and switch to looking at
+      // that new edge so that we clone Node for the new caller clone.
+      bool UpdatedEdge = false;
+      if (OrigIdCount > CallerEdge->getContextIds().size()) {
+        for (auto E : Node->CallerEdges) {
+          // Only interested in clones of the current edges caller.
+          if (E->Caller->CloneOf != CallerEdge->Caller)
+            continue;
+          // See if this edge contains any of the context ids originally on the
+          // current caller edge.
+          auto CallerEdgeContextsForAllocNew =
+              set_intersection(CallerEdgeContextsForAlloc, E->getContextIds());
+          if (CallerEdgeContextsForAllocNew.empty())
+            continue;
+          // Make sure we don't pick a previously existing caller edge of this
+          // Node, which would be processed on a different iteration of the
+          // outer loop over the saved CallerEdges.
+          if (std::find(CallerEdges.begin(), CallerEdges.end(), E) !=
+              CallerEdges.end())
+            continue;
+          // The CallerAllocTypeForAlloc and CalleeEdgeAllocTypesForCallerEdge
+          // are updated further below for all cases where we just invoked
+          // identifyClones recursively.
+          CallerEdgeContextsForAlloc.swap(CallerEdgeContextsForAllocNew);
+          CallerEdge = E;
+          UpdatedEdge = true;
+          break;
+        }
+      }
+      // If cloning removed this edge (and we didn't update it to a new edge
+      // above), we're done with this edge. It's possible we moved all of the
+      // context ids to an existing clone, in which case there's no need to do
+      // further processing for them.
+      if (CallerEdge->isRemoved())
+        continue;
+
+      // Now we need to update the information used for the cloning decisions
+      // further below, as we may have modified edges and their context ids.
+
+      // Note if we changed the CallerEdge above we would have already updated
+      // the context ids.
+      if (!UpdatedEdge) {
+        CallerEdgeContextsForAlloc = set_intersection(
+            CallerEdgeContextsForAlloc, CallerEdge->getContextIds());
+        if (CallerEdgeContextsForAlloc.empty())
+          continue;
+      }
+      // Update the other information that depends on the edges and on the now
+      // updated CallerEdgeContextsForAlloc.
+      CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
+      CalleeEdgeAllocTypesForCallerEdge.clear();
+      for (auto &CalleeEdge : Node->CalleeEdges) {
+        CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
+            CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
+      }
+    }
 
     // First see if we can use an existing clone. Check each clone and its
     // callee edges for matching alloc types.
