Commit 9483ff9

Reapply "[MemProf] Streamline and avoid unnecessary context id duplication (#107918)" (#110036)
This reverts commit 12d4769, reapplying 524a028 but with fixes for failures seen in broader testing.
1 parent: bfe2994

2 files changed (+173, -35 lines)

llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp (71 additions, 35 deletions)

```diff
@@ -1377,9 +1377,12 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
   // Compute the last node's context ids once, as it is shared by all calls in
   // this entry.
   DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
-  assert(!LastNodeContextIds.empty());
 
-  for (unsigned I = 0; I < Calls.size(); I++) {
+  bool PrevIterCreatedNode = false;
+  bool CreatedNode = false;
+  for (unsigned I = 0; I < Calls.size();
+       I++, PrevIterCreatedNode = CreatedNode) {
+    CreatedNode = false;
     auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
     // Skip any for which we didn't assign any ids, these don't get a node in
     // the graph.
```
```diff
@@ -1391,7 +1394,13 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
       if (!CallToMatchingCall.contains(Call))
         continue;
       auto MatchingCall = CallToMatchingCall[Call];
-      assert(NonAllocationCallToContextNodeMap.contains(MatchingCall));
+      if (!NonAllocationCallToContextNodeMap.contains(MatchingCall)) {
+        // This should only happen if we had a prior iteration, and it didn't
+        // create a node because of the below recomputation of context ids
+        // finding none remaining and continuing early.
+        assert(I > 0 && !PrevIterCreatedNode);
+        continue;
+      }
       NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
           Call);
       continue;
```
```diff
@@ -1444,6 +1453,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
     ContextNode *NewNode = NodeOwner.back().get();
     NodeToCallingFunc[NewNode] = Func;
     NonAllocationCallToContextNodeMap[Call] = NewNode;
+    CreatedNode = true;
     NewNode->AllocTypes = computeAllocType(SavedContextIds);
 
     ContextNode *FirstNode = getNodeForStackId(Ids[0]);
```
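Taken together, these three hunks replace a hard assert with tolerance for a previous iteration that bailed out before creating a node: the result of each iteration is published to the next one through the for-statement's increment clause. Below is a minimal standalone sketch of that flag-threading idiom; `Entry` and `processAll` are hypothetical stand-ins, not the LLVM types.

```cpp
#include <cassert>
#include <vector>

// Hypothetical stand-in for a Calls entry; not the LLVM CallContextInfo.
struct Entry {
  bool MatchesPrev;     // expects to share the previous entry's node
  bool HasRemainingIds; // recomputed context ids were non-empty
};

void processAll(const std::vector<Entry> &Entries) {
  bool PrevIterCreatedNode = false;
  bool CreatedNode = false;
  // The increment clause publishes this iteration's result to the next one.
  for (unsigned I = 0; I < Entries.size();
       I++, PrevIterCreatedNode = CreatedNode) {
    CreatedNode = false; // reset before any early 'continue'
    const Entry &E = Entries[I];
    if (E.MatchesPrev) {
      if (!PrevIterCreatedNode) {
        // The earlier entry we were matched against bailed out before
        // creating a node; mirrors the relaxed assert in the hunk above.
        assert(I > 0);
        continue;
      }
      // Would attach this call to the previously created node here.
      continue;
    }
    if (!E.HasRemainingIds)
      continue; // early exit: no node created this iteration
    CreatedNode = true; // node creation would happen here
  }
}
```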
```diff
@@ -1548,13 +1558,23 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
   // of length, and within each length, lexicographically by stack id. The
   // latter is so that we can specially handle calls that have identical stack
   // id sequences (either due to cloning or artificially because of the MIB
-  // context pruning).
-  std::stable_sort(Calls.begin(), Calls.end(),
-                   [](const CallContextInfo &A, const CallContextInfo &B) {
-                     return A.StackIds.size() > B.StackIds.size() ||
-                            (A.StackIds.size() == B.StackIds.size() &&
-                             A.StackIds < B.StackIds);
-                   });
+  // context pruning). Those with the same Ids are then sorted by function to
+  // facilitate efficiently mapping them to the same context node.
+  // Because the functions are pointers, to ensure a stable sort first assign
+  // each function pointer to its first index in the Calls array, and then use
+  // that to sort by.
+  DenseMap<const FuncTy *, unsigned> FuncToIndex;
+  for (const auto &[Idx, CallCtxInfo] : enumerate(Calls))
+    FuncToIndex.insert({CallCtxInfo.Func, Idx});
+  std::stable_sort(
+      Calls.begin(), Calls.end(),
+      [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) {
+        return A.StackIds.size() > B.StackIds.size() ||
+               (A.StackIds.size() == B.StackIds.size() &&
+                (A.StackIds < B.StackIds ||
+                 (A.StackIds == B.StackIds &&
+                  FuncToIndex[A.Func] < FuncToIndex[B.Func])));
+      });
 
   // Find the node for the last stack id, which should be the same
   // across all calls recorded for this id, and is the id for this
```
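The interesting detail here is the tiebreaker: comparing raw function pointers would make the sort order vary from run to run, so each pointer is first mapped to the index of its first occurrence in the array, and that index is compared instead. A minimal sketch of the idea, with simplified stand-in types rather than the LLVM `CallContextInfo`:

```cpp
#include <algorithm>
#include <cstdint>
#include <unordered_map>
#include <vector>

struct Func {}; // stand-in for the function type

struct CallInfo {
  std::vector<uint64_t> StackIds;
  const Func *F;
};

void sortCalls(std::vector<CallInfo> &Calls) {
  // First-occurrence index per function pointer; insert() keeps the first.
  std::unordered_map<const Func *, unsigned> FuncToIndex;
  for (unsigned Idx = 0; Idx < Calls.size(); ++Idx)
    FuncToIndex.insert({Calls[Idx].F, Idx});
  std::stable_sort(Calls.begin(), Calls.end(),
                   [&FuncToIndex](const CallInfo &A, const CallInfo &B) {
                     // Longest stack first, then lexicographic ids, then the
                     // deterministic first-occurrence index of the function.
                     if (A.StackIds.size() != B.StackIds.size())
                       return A.StackIds.size() > B.StackIds.size();
                     if (A.StackIds != B.StackIds)
                       return A.StackIds < B.StackIds;
                     return FuncToIndex[A.F] < FuncToIndex[B.F];
                   });
}
```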
```diff
@@ -1572,18 +1592,26 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
   DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
   assert(!LastNodeContextIds.empty());
 
-  // Map from function to the first call from the below list (with matching
-  // stack ids) found in that function. Note that calls from different
-  // functions can have the same stack ids because this is the list of stack
-  // ids that had (possibly pruned) nodes after building the graph from the
-  // allocation MIBs.
-  DenseMap<const FuncTy *, CallInfo> FuncToCallMap;
+#ifndef NDEBUG
+  // Save the set of functions seen for a particular set of the same stack
+  // ids. This is used to ensure that they have been correctly sorted to be
+  // adjacent in the Calls list, since we rely on that to efficiently place
+  // all such matching calls onto the same context node.
+  DenseSet<const FuncTy *> MatchingIdsFuncSet;
+#endif
 
   for (unsigned I = 0; I < Calls.size(); I++) {
     auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
     assert(SavedContextIds.empty());
     assert(LastId == Ids.back());
 
+#ifndef NDEBUG
+    // If this call has a different set of ids than the last one, clear the
+    // set used to ensure they are sorted properly.
+    if (I > 0 && Ids != Calls[I - 1].StackIds)
+      MatchingIdsFuncSet.clear();
+#endif
+
     // First compute the context ids for this stack id sequence (the
     // intersection of the context ids of the corresponding nodes).
     // Start with the remaining saved ids for the last node.
```
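This hunk swaps the old `FuncToCallMap` for debug-only state that merely verifies the sort produced the adjacency the algorithm now relies on. A minimal sketch of the assertion-only-state pattern, using simplified key/value pairs instead of the LLVM types:

```cpp
#include <cassert>
#include <set>
#include <utility>
#include <vector>

// Each element is a (stack-id-key, function) pair; stand-ins for the LLVM
// types. The list is assumed pre-sorted so equal keys are adjacent and,
// within a key, equal functions are adjacent.
using Elem = std::pair<int, const void *>;

void walkSorted(const std::vector<Elem> &Calls) {
#ifndef NDEBUG
  // Functions already seen for the current key; exists only in +asserts
  // builds, like MatchingIdsFuncSet in the hunk above.
  std::set<const void *> SeenForKey;
#endif
  for (unsigned I = 0; I < Calls.size(); ++I) {
#ifndef NDEBUG
    // New key: reset the per-key set.
    if (I > 0 && Calls[I].first != Calls[I - 1].first)
      SeenForKey.clear();
    // A function may only appear once per key here, because the loop below
    // bulk-consumes its whole run; a repeat means the sort was wrong.
    assert(!SeenForKey.count(Calls[I].second));
    SeenForKey.insert(Calls[I].second);
#endif
    // Bulk-consume the rest of this (key, function) run, standing in for
    // the matching-call loop in the next hunk.
    while (I + 1 < Calls.size() && Calls[I + 1] == Calls[I])
      ++I;
  }
}
```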
```diff
@@ -1652,23 +1680,38 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
       continue;
     }
 
-    // If the prior call had the same stack ids this map would not be empty.
+#ifndef NDEBUG
+    // If the prior call had the same stack ids this set would not be empty.
     // Check if we already have a call that "matches" because it is located
-    // in the same function.
-    if (FuncToCallMap.contains(Func)) {
-      // Record the matching call found for this call, and skip it. We
-      // will subsequently combine it into the same node.
-      CallToMatchingCall[Call] = FuncToCallMap[Func];
-      continue;
-    }
+    // in the same function. If the Calls list was sorted properly we should
+    // not encounter this situation as all such entries should be adjacent
+    // and processed in bulk further below.
+    assert(!MatchingIdsFuncSet.contains(Func));
+
+    MatchingIdsFuncSet.insert(Func);
+#endif
 
     // Check if the next set of stack ids is the same (since the Calls vector
     // of tuples is sorted by the stack ids we can just look at the next one).
+    // If so, save them in the CallToMatchingCall map so that they get
+    // assigned to the same context node, and skip them.
     bool DuplicateContextIds = false;
-    if (I + 1 < Calls.size()) {
-      auto &CallCtxInfo = Calls[I + 1];
+    for (unsigned J = I + 1; J < Calls.size(); J++) {
+      auto &CallCtxInfo = Calls[J];
       auto &NextIds = CallCtxInfo.StackIds;
-      DuplicateContextIds = Ids == NextIds;
+      if (NextIds != Ids)
+        break;
+      auto *NextFunc = CallCtxInfo.Func;
+      if (NextFunc != Func) {
+        // We have another Call with the same ids but that cannot share this
+        // node, must duplicate ids for it.
+        DuplicateContextIds = true;
+        break;
+      }
+      auto &NextCall = CallCtxInfo.Call;
+      CallToMatchingCall[NextCall] = Call;
+      // Update I so that it gets incremented correctly to skip this call.
+      I = J;
     }
 
     // If we don't have duplicate context ids, then we can assign all the
```
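The single lookahead becomes a loop over the whole run of identical stack-id sequences: adjacent calls from the same function are recorded as matching calls and skipped in bulk, while the first call from a different function ends the run and forces context-id duplication. A minimal sketch of that run-grouping under simplified types; `groupRun` and `MapMatching` are hypothetical names, not LLVM API:

```cpp
#include <cstdint>
#include <functional>
#include <vector>

struct Func {}; // stand-in for the function type

struct CallInfo {
  std::vector<uint64_t> StackIds;
  const Func *F;
  int CallId; // stand-in for the call handle
};

// Walks forward from Calls[I] over entries with the same StackIds. Returns
// true if a later call with the same ids belongs to a different function
// and therefore needs duplicated context ids.
bool groupRun(std::vector<CallInfo> &Calls, unsigned &I,
              const std::function<void(int, int)> &MapMatching) {
  const auto &Ids = Calls[I].StackIds;
  const Func *F = Calls[I].F;
  bool DuplicateContextIds = false;
  for (unsigned J = I + 1; J < Calls.size(); ++J) {
    if (Calls[J].StackIds != Ids)
      break; // end of the run of identical id sequences
    if (Calls[J].F != F) {
      // Same ids, different function: cannot share the node being built.
      DuplicateContextIds = true;
      break;
    }
    // Same ids and same function: record the match so it is later placed
    // on the same context node, then consume the entry.
    MapMatching(Calls[J].CallId, Calls[I].CallId);
    I = J; // advance so the caller's loop skips the consumed entry
  }
  return DuplicateContextIds;
}
```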
```diff
@@ -1692,14 +1735,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
       set_subtract(LastNodeContextIds, StackSequenceContextIds);
       if (LastNodeContextIds.empty())
         break;
-      // No longer possibly in a sequence of calls with duplicate stack ids,
-      // clear the map.
-      FuncToCallMap.clear();
-    } else
-      // Record the call with its function, so we can locate it the next time
-      // we find a call from this function when processing the calls with the
-      // same stack ids.
-      FuncToCallMap[Func] = Call;
+    }
   }
 }
```

New test file (102 additions, 0 deletions; the file path is not preserved in this extraction):
```llvm
;; This test ensures that the logic which assigns calls to stack nodes
;; correctly handles a case where multiple nodes have stack ids that
;; overlap with each other but have different last nodes (can happen with
;; inlining into various levels of a call chain). Specifically, when we
;; have one that is duplicated (e.g. unrolling), we need to correctly
;; handle the case where the context id has already been assigned to
;; a different call with a different last node.

;; -stats requires asserts
; REQUIRES: asserts

; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
; RUN:   -memprof-verify-ccg -memprof-verify-nodes \
; RUN:   -stats -pass-remarks=memprof-context-disambiguation \
; RUN:   %s -S 2>&1 | FileCheck %s --check-prefix=IR \
; RUN:   --check-prefix=STATS --check-prefix=REMARKS

; REMARKS: created clone _Z1Ab.memprof.1
; REMARKS: created clone _Z3XZNv.memprof.1
; REMARKS: call in clone main assigned to call function clone _Z3XZNv.memprof.1
;; Make sure the inlined context in _Z3XZNv, which partially overlaps the stack
;; ids in the shorter inlined context of _Z2XZv, correctly calls a cloned
;; version of _Z1Ab, which will call the cold annotated allocation.
; REMARKS: call in clone _Z3XZNv.memprof.1 assigned to call function clone _Z1Ab.memprof.1
; REMARKS: call in clone _Z1Ab.memprof.1 marked with memprof allocation attribute cold
; REMARKS: call in clone main assigned to call function clone _Z3XZNv
; REMARKS: call in clone _Z3XZNv assigned to call function clone _Z1Ab
; REMARKS: call in clone _Z1Ab marked with memprof allocation attribute notcold


target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define dso_local void @_Z1Ab() {
entry:
  %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #1, !memprof !0, !callsite !5
  ret void
}

; Function Attrs: nobuiltin
declare ptr @_Znam(i64) #0

;; Inlining of stack id 2 into 3. Assume this is called from somewhere else.
define dso_local void @_Z2XZv() local_unnamed_addr #0 {
entry:
  ;; Simulate duplication of the callsite (e.g. unrolling).
  call void @_Z1Ab(), !callsite !6
  call void @_Z1Ab(), !callsite !6
  ret void
}

;; Inlining of stack id 2 into 3 into 4. Called by main below.
define dso_local void @_Z3XZNv() local_unnamed_addr {
entry:
  call void @_Z1Ab(), !callsite !7
  ret void
}

define dso_local noundef i32 @main() local_unnamed_addr {
entry:
  call void @_Z3XZNv(), !callsite !8 ;; Not cold context
  call void @_Z3XZNv(), !callsite !9 ;; Cold context
  ret i32 0
}

attributes #0 = { nobuiltin }
attributes #1 = { builtin }

!0 = !{!1, !3}
;; Not cold context via first call to _Z3XZNv in main
!1 = !{!2, !"notcold"}
!2 = !{i64 1, i64 2, i64 3, i64 4, i64 5}
;; Cold context via second call to _Z3XZNv in main
!3 = !{!4, !"cold"}
!4 = !{i64 1, i64 2, i64 3, i64 4, i64 6}
!5 = !{i64 1}
!6 = !{i64 2, i64 3}
!7 = !{i64 2, i64 3, i64 4}
!8 = !{i64 5}
!9 = !{i64 6}

; IR: define {{.*}} @_Z1Ab()
; IR:   call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]]
; IR: define {{.*}} @_Z2XZv()
; IR:   call {{.*}} @_Z1Ab()
; IR:   call {{.*}} @_Z1Ab()
; IR: define {{.*}} @_Z3XZNv()
; IR:   call {{.*}} @_Z1Ab()
; IR: define {{.*}} @main()
; IR:   call {{.*}} @_Z3XZNv()
; IR:   call {{.*}} @_Z3XZNv.memprof.1()
; IR: define {{.*}} @_Z1Ab.memprof.1()
; IR:   call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]]
; IR: define {{.*}} @_Z3XZNv.memprof.1()
; IR:   call {{.*}} @_Z1Ab.memprof.1()

; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" }
; IR: attributes #[[COLD]] = { "memprof"="cold" }

; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
; STATS: 2 memprof-context-disambiguation - Number of function clones created during whole program analysis
```
