@@ -89,6 +89,7 @@ STATISTIC(FoundProfiledCalleeMaxDepth,
           "Maximum depth of profiled callees found via tail calls");
 STATISTIC(FoundProfiledCalleeNonUniquelyCount,
           "Number of profiled callees found via multiple tail call chains");
+STATISTIC(DeferredBackedges, "Number of backedges with deferred cloning");
 
 static cl::opt<std::string> DotFilePathPrefix(
     "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
@@ -127,14 +128,18 @@ static cl::opt<bool> AllowRecursiveCallsites(
     "memprof-allow-recursive-callsites", cl::init(true), cl::Hidden,
     cl::desc("Allow cloning of callsites involved in recursive cycles"));
 
+static cl::opt<bool> CloneRecursiveContexts(
+    "memprof-clone-recursive-contexts", cl::init(true), cl::Hidden,
+    cl::desc("Allow cloning of contexts through recursive cycles"));
+
 // When disabled, try to detect and prevent cloning of recursive contexts.
 // This is only necessary until we support cloning through recursive cycles.
 // Leave on by default for now, as disabling requires a little bit of compile
 // time overhead and doesn't affect correctness, it will just inflate the cold
 // hinted bytes reporting a bit when -memprof-report-hinted-sizes is enabled.
 static cl::opt<bool> AllowRecursiveContexts(
     "memprof-allow-recursive-contexts", cl::init(true), cl::Hidden,
-    cl::desc("Allow cloning of contexts through recursive cycles"));
+    cl::desc("Allow cloning of contexts having recursive cycles"));
 
 namespace llvm {
 cl::opt<bool> EnableMemProfContextDisambiguation(
@@ -293,37 +298,40 @@ class CallsiteContextGraph {
     // TODO: Should this be a map (from Caller node) for more efficient lookup?
     std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
 
-    // Get the list of edges from which we can compute allocation information
-    // such as the context ids and allocation type of this node.
-    const std::vector<std::shared_ptr<ContextEdge>> *
-    getEdgesWithAllocInfo() const {
-      // If node has any callees, compute from those, otherwise compute from
-      // callers (i.e. if this is the leaf allocation node).
-      if (!CalleeEdges.empty())
-        return &CalleeEdges;
+    // Returns true if we need to look at the callee edges for determining the
+    // node context ids and allocation type.
+    bool useCallerEdgesForContextInfo() const {
       // Typically if the callee edges are empty either the caller edges are
       // also empty, or this is an allocation (leaf node). However, if we are
       // allowing recursive callsites and contexts this will be violated for
       // incompletely cloned recursive cycles.
-      assert(CallerEdges.empty() || IsAllocation ||
+      assert(!CalleeEdges.empty() || CallerEdges.empty() || IsAllocation ||
             (AllowRecursiveCallsites && AllowRecursiveContexts));
-      if (!CallerEdges.empty() && IsAllocation)
-        return &CallerEdges;
-      return nullptr;
+      // When cloning for a recursive context, during cloning we might be in the
+      // midst of cloning for a recurrence and have moved context ids off of a
+      // caller edge onto the clone but not yet off of the incoming caller
+      // (back) edge. If we don't look at those we miss the fact that this node
+      // still has context ids of interest.
+      return IsAllocation || CloneRecursiveContexts;
     }
 
     // Compute the context ids for this node from the union of its edge context
     // ids.
     DenseSet<uint32_t> getContextIds() const {
-      DenseSet<uint32_t> ContextIds;
-      auto *Edges = getEdgesWithAllocInfo();
-      if (!Edges)
-        return {};
       unsigned Count = 0;
-      for (auto &Edge : *Edges)
+      // Compute the number of ids for reserve below. In general we only need to
+      // look at one set of edges, typically the callee edges, since other than
+      // allocations and in some cases during recursion cloning, all the context
+      // ids on the callers should also flow out via callee edges.
+      for (auto &Edge : CalleeEdges.empty() ? CallerEdges : CalleeEdges)
         Count += Edge->getContextIds().size();
+      DenseSet<uint32_t> ContextIds;
       ContextIds.reserve(Count);
-      for (auto &Edge : *Edges)
+      auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
+          CalleeEdges, useCallerEdgesForContextInfo()
+                           ? CallerEdges
+                           : std::vector<std::shared_ptr<ContextEdge>>());
+      for (const auto &Edge : Edges)
         ContextIds.insert(Edge->getContextIds().begin(),
                           Edge->getContextIds().end());
       return ContextIds;
@@ -332,13 +340,14 @@ class CallsiteContextGraph {
     // Compute the allocation type for this node from the OR of its edge
     // allocation types.
     uint8_t computeAllocType() const {
-      auto *Edges = getEdgesWithAllocInfo();
-      if (!Edges)
-        return (uint8_t)AllocationType::None;
       uint8_t BothTypes =
           (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
       uint8_t AllocType = (uint8_t)AllocationType::None;
-      for (auto &Edge : *Edges) {
+      auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
+          CalleeEdges, useCallerEdgesForContextInfo()
+                           ? CallerEdges
+                           : std::vector<std::shared_ptr<ContextEdge>>());
+      for (const auto &Edge : Edges) {
         AllocType |= Edge->AllocTypes;
         // Bail early if alloc type reached both, no further refinement.
         if (AllocType == BothTypes)
@@ -350,10 +359,11 @@ class CallsiteContextGraph {
     // The context ids set for this node is empty if its edge context ids are
     // also all empty.
     bool emptyContextIds() const {
-      auto *Edges = getEdgesWithAllocInfo();
-      if (!Edges)
-        return true;
-      for (auto &Edge : *Edges) {
+      auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
+          CalleeEdges, useCallerEdgesForContextInfo()
+                           ? CallerEdges
+                           : std::vector<std::shared_ptr<ContextEdge>>());
+      for (const auto &Edge : Edges) {
         if (!Edge->getContextIds().empty())
           return false;
       }
@@ -434,6 +444,14 @@ class CallsiteContextGraph {
     // for contexts including this edge.
     uint8_t AllocTypes = 0;
 
+    // Set just before initiating cloning when cloning of recursive contexts is
+    // enabled. Used to defer cloning of backedges until we have done cloning of
+    // the callee node for non-backedge caller edges. This exposes cloning
+    // opportunities through the backedge of the cycle.
+    // TODO: Note that this is not updated during cloning, and it is unclear
+    // whether that would be needed.
+    bool IsBackedge = false;
+
     // The set of IDs for contexts including this edge.
     DenseSet<uint32_t> ContextIds;
 
@@ -722,6 +740,9 @@ class CallsiteContextGraph {
   void moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
                                  ContextNode *NewCaller);
 
+  void markBackedges(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
+                     DenseSet<const ContextNode *> &CurrentStack);
+
   /// Recursively perform cloning on the graph for the given Node and its
   /// callers, in order to uniquely identify the allocation behavior of an
   /// allocation given its context. The context ids of the allocation being
@@ -2874,6 +2895,7 @@ template <typename DerivedCCG, typename FuncTy, typename CallTy>
 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
     raw_ostream &OS) const {
   OS << "Edge from Callee " << Callee << " to Caller: " << Caller
+     << (IsBackedge ? " (BE)" : "")
      << " AllocTypes: " << getAllocTypeString(AllocTypes);
   OS << " ContextIds:";
   std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
@@ -3115,6 +3137,8 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
   // node (Edge's current callee may be the original node too).
   assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
 
+  bool EdgeIsRecursive = Edge->Callee == Edge->Caller;
+
   ContextNode *OldCallee = Edge->Callee;
 
   // We might already have an edge to the new callee from earlier cloning for a
@@ -3181,8 +3205,16 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
     // If this is a direct recursion edge, use NewCallee (the clone) as the
     // callee as well, so that any edge updated/created here is also direct
     // recursive.
-    if (CalleeToUse == OldCallee)
+    if (CalleeToUse == OldCallee) {
+      // If this is a recursive edge, see if we already moved a recursive edge
+      // (which would have to have been this one) - if we were only moving a
+      // subset of context ids it would still be on OldCallee.
+      if (EdgeIsRecursive) {
+        assert(OldCalleeEdge == Edge);
+        continue;
+      }
       CalleeToUse = NewCallee;
+    }
     // The context ids moving to the new callee are the subset of this edge's
     // context ids and the context ids on the caller edge being moved.
     DenseSet<uint32_t> EdgeContextIdsToMove =
@@ -3369,9 +3401,47 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
   }
 }
 
+// This is the standard DFS based backedge discovery algorithm.
+template <typename DerivedCCG, typename FuncTy, typename CallTy>
+void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges(
+    ContextNode *Node, DenseSet<const ContextNode *> &Visited,
+    DenseSet<const ContextNode *> &CurrentStack) {
+  auto I = Visited.insert(Node);
+  // We should only call this for unvisited nodes.
+  assert(I.second);
+  for (auto &CalleeEdge : Node->CalleeEdges) {
+    auto *Callee = CalleeEdge->Callee;
+    if (Visited.count(Callee)) {
+      // Since this was already visited we need to check if it is currently on
+      // the recursive stack in which case it is a backedge.
+      if (CurrentStack.count(Callee))
+        CalleeEdge->IsBackedge = true;
+      continue;
+    }
+    CurrentStack.insert(Callee);
+    markBackedges(Callee, Visited, CurrentStack);
+    CurrentStack.erase(Callee);
+  }
+}
+
 template <typename DerivedCCG, typename FuncTy, typename CallTy>
 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
+  // If we are cloning recursive contexts, find and mark backedges from all root
+  // callers, using the typical DFS based backedge analysis.
   DenseSet<const ContextNode *> Visited;
+  if (CloneRecursiveContexts) {
+    DenseSet<const ContextNode *> CurrentStack;
+    for (auto &Entry : NonAllocationCallToContextNodeMap) {
+      auto *Node = Entry.second;
+      if (Node->isRemoved())
+        continue;
+      // It is a root if it doesn't have callers.
+      if (!Node->CallerEdges.empty())
+        continue;
+      markBackedges(Node, Visited, CurrentStack);
+      assert(CurrentStack.empty());
+    }
+  }
   for (auto &Entry : AllocationCallToContextNodeMap) {
     Visited.clear();
     identifyClones(Entry.second, Visited, Entry.second->getContextIds());
@@ -3430,6 +3500,14 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
       assert(!is_contained(Node->CallerEdges, Edge));
       continue;
     }
+    // Defer backedges. See comments further below where these edges are
+    // handled during the cloning of this Node.
+    if (Edge->IsBackedge) {
+      // We should only mark these if cloning recursive contexts, where we
+      // need to do this deferral.
+      assert(CloneRecursiveContexts);
+      continue;
+    }
     // Ignore any caller we previously visited via another edge.
     if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
       identifyClones(Edge->Caller, Visited, AllocContextIds);
@@ -3483,6 +3561,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
   assert(Node->AllocTypes != (uint8_t)AllocationType::None);
 
   DenseSet<uint32_t> RecursiveContextIds;
+  assert(AllowRecursiveContexts || !CloneRecursiveContexts);
   // If we are allowing recursive callsites, but have also disabled recursive
   // contexts, look for context ids that show up in multiple caller edges.
   if (AllowRecursiveCallsites && !AllowRecursiveContexts) {
@@ -3505,6 +3584,13 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
   // makes it less error-prone.
   auto CallerEdges = Node->CallerEdges;
   for (auto &CallerEdge : CallerEdges) {
+    // Skip any that have been removed by an earlier recursive call.
+    if (CallerEdge->isRemoved()) {
+      assert(!is_contained(Node->CallerEdges, CallerEdge));
+      continue;
+    }
+    assert(CallerEdge->Callee == Node);
+
     // See if cloning the prior caller edge left this node with a single alloc
     // type or a single caller. In that case no more cloning of Node is needed.
     if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
@@ -3546,13 +3632,100 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
     //
     // Then check if by cloning node at least one of the callee edges will be
     // disambiguated by splitting out different context ids.
+    //
+    // However, always do the cloning if this is a backedge, in which case we
+    // have not yet cloned along this caller edge.
     assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
     assert(Node->AllocTypes != (uint8_t)AllocationType::None);
-    if (allocTypeToUse(CallerAllocTypeForAlloc) ==
+    if (!CallerEdge->IsBackedge &&
+        allocTypeToUse(CallerAllocTypeForAlloc) ==
            allocTypeToUse(Node->AllocTypes) &&
        allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
-            CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges))
+            CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
      continue;
+    }
+
+    if (CallerEdge->IsBackedge) {
+      // We should only mark these if cloning recursive contexts, where we
+      // need to do this deferral.
+      assert(CloneRecursiveContexts);
+      DeferredBackedges++;
+    }
+
+    // If this is a backedge, we now do recursive cloning starting from its
+    // caller since we may have moved unambiguous caller contexts to a clone
+    // of this Node in a previous iteration of the current loop, giving more
+    // opportunity for cloning through the backedge. Because we sorted the
+    // caller edges earlier so that cold caller edges are first, we would have
+    // visited and cloned this node for any unambiguously cold non-recursive
+    // callers before any ambiguous backedge callers. Note that we don't do this
+    // if the caller is already cloned or visited during cloning (e.g. via a
+    // different context path from the allocation).
+    // TODO: Can we do better in the case where the caller was already visited?
+    if (CallerEdge->IsBackedge && !CallerEdge->Caller->CloneOf &&
+        !Visited.count(CallerEdge->Caller)) {
+      const auto OrigIdCount = CallerEdge->getContextIds().size();
+      // Now do the recursive cloning of this backedge's caller, which was
+      // deferred earlier.
+      identifyClones(CallerEdge->Caller, Visited, CallerEdgeContextsForAlloc);
+      removeNoneTypeCalleeEdges(CallerEdge->Caller);
+      // See if the recursive call to identifyClones moved the context ids to a
+      // new edge from this node to a clone of caller, and switch to looking at
+      // that new edge so that we clone Node for the new caller clone.
+      bool UpdatedEdge = false;
+      if (OrigIdCount > CallerEdge->getContextIds().size()) {
+        for (auto E : Node->CallerEdges) {
+          // Only interested in clones of the current edge's caller.
+          if (E->Caller->CloneOf != CallerEdge->Caller)
+            continue;
+          // See if this edge contains any of the context ids originally on the
+          // current caller edge.
+          auto CallerEdgeContextsForAllocNew =
+              set_intersection(CallerEdgeContextsForAlloc, E->getContextIds());
+          if (CallerEdgeContextsForAllocNew.empty())
+            continue;
+          // Make sure we don't pick a previously existing caller edge of this
+          // Node, which would be processed on a different iteration of the
+          // outer loop over the saved CallerEdges.
+          if (std::find(CallerEdges.begin(), CallerEdges.end(), E) !=
+              CallerEdges.end())
+            continue;
+          // The CallerAllocTypeForAlloc and CalleeEdgeAllocTypesForCallerEdge
+          // are updated further below for all cases where we just invoked
+          // identifyClones recursively.
+          CallerEdgeContextsForAlloc.swap(CallerEdgeContextsForAllocNew);
+          CallerEdge = E;
+          UpdatedEdge = true;
+          break;
+        }
+      }
+      // If cloning removed this edge (and we didn't update it to a new edge
+      // above), we're done with this edge. It's possible we moved all of the
+      // context ids to an existing clone, in which case there's no need to do
+      // further processing for them.
+      if (CallerEdge->isRemoved())
+        continue;
+
+      // Now we need to update the information used for the cloning decisions
+      // further below, as we may have modified edges and their context ids.
+
+      // Note if we changed the CallerEdge above we would have already updated
+      // the context ids.
+      if (!UpdatedEdge) {
+        CallerEdgeContextsForAlloc = set_intersection(
+            CallerEdgeContextsForAlloc, CallerEdge->getContextIds());
+        if (CallerEdgeContextsForAlloc.empty())
+          continue;
+      }
+      // Update the other information that depends on the edges and on the now
+      // updated CallerEdgeContextsForAlloc.
+      CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
+      CalleeEdgeAllocTypesForCallerEdge.clear();
+      for (auto &CalleeEdge : Node->CalleeEdges) {
+        CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
+            CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
+      }
+    }
 
     // First see if we can use an existing clone. Check each clone and its
     // callee edges for matching alloc types.
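
The new markBackedges routine above is described in the patch comment as the standard DFS based backedge discovery algorithm. As a rough, self-contained illustration of that idea outside of LLVM (hypothetical Node/Edge types, not the patch's ContextNode/ContextEdge; assuming C++14 or later), a minimal sketch could be:

// Illustrative sketch only (not part of the patch): DFS backedge marking over
// hypothetical Node/Edge types.
#include <unordered_set>
#include <vector>

struct Edge;
struct Node {
  std::vector<Edge *> CalleeEdges; // outgoing edges to callees
};
struct Edge {
  Node *Callee = nullptr;
  bool IsBackedge = false;
};

// Mark an edge as a backedge when its target is already on the current DFS
// stack, i.e. following it would re-enter a node we are still expanding.
void markBackedges(Node *N, std::unordered_set<const Node *> &Visited,
                   std::unordered_set<const Node *> &CurrentStack) {
  Visited.insert(N);
  for (Edge *E : N->CalleeEdges) {
    Node *Callee = E->Callee;
    if (Visited.count(Callee)) {
      if (CurrentStack.count(Callee))
        E->IsBackedge = true; // closes a cycle
      continue;
    }
    CurrentStack.insert(Callee);
    markBackedges(Callee, Visited, CurrentStack);
    CurrentStack.erase(Callee);
  }
}

int main() {
  // R -> A -> B -> A: only the edge B -> A closes a cycle.
  Node R, A, B;
  Edge RA{&A}, AB{&B}, BA{&A};
  R.CalleeEdges.push_back(&RA);
  A.CalleeEdges.push_back(&AB);
  B.CalleeEdges.push_back(&BA);
  std::unordered_set<const Node *> Visited, CurrentStack;
  markBackedges(&R, Visited, CurrentStack);
  // BA.IsBackedge is now true; RA.IsBackedge and AB.IsBackedge remain false.
  return BA.IsBackedge && !RA.IsBackedge && !AB.IsBackedge ? 0 : 1;
}

As in the patch, the Visited set prevents revisiting nodes across root traversals, while CurrentStack tracks only the nodes on the active recursion path, so an edge is flagged only when it targets an ancestor still being expanded.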