Skip to content

Commit bf6ff4f

Browse files
committed
[MemProf] Context disambiguation cloning pass [patch 3/4]
Applies cloning decisions to the IR, cloning functions and updating calls. For Regular LTO, the IR is updated directly during function assignment, whereas for ThinLTO it is recorded in the summary index (a subsequent patch will apply to the IR via the index during the ThinLTO backend). The function assignment and cloning proceeds greedily, and we create new clones as needed when we find an incompatible assignment of function clones to callsite clones (i.e. when different callers need to invoke different combinations of callsite clones). Depends on D140949. Differential Revision: https://reviews.llvm.org/D141077
1 parent 853d212 commit bf6ff4f

File tree

12 files changed

+1448
-31
lines changed

12 files changed

+1448
-31
lines changed

llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,14 @@ namespace llvm {
2525
class GlobalValueSummary;
2626
class Module;
2727
class ModuleSummaryIndex;
28+
class OptimizationRemarkEmitter;
2829

2930
class MemProfContextDisambiguation
3031
: public PassInfoMixin<MemProfContextDisambiguation> {
3132
/// Run the context disambiguator on \p M, returns true if any changes made.
32-
bool processModule(Module &M);
33+
bool processModule(
34+
Module &M,
35+
function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
3336

3437
public:
3538
MemProfContextDisambiguation() {}

llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp

Lines changed: 671 additions & 16 deletions
Large diffs are not rendered by default.

llvm/test/ThinLTO/X86/memprof-basic.ll

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,13 +39,35 @@
3939
; RUN: -r=%t.o,_Znam, \
4040
; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
4141
; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
42-
; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
42+
; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \
43+
; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
44+
; RUN: --check-prefix=STATS
4345

4446
; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
4547
;; We should have cloned bar, baz, and foo, for the cold memory allocation.
4648
; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
4749

4850

51+
;; Try again but with distributed ThinLTO
52+
; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
53+
; RUN: -thinlto-distributed-indexes \
54+
; RUN: -r=%t.o,main,plx \
55+
; RUN: -r=%t.o,_ZdaPv, \
56+
; RUN: -r=%t.o,sleep, \
57+
; RUN: -r=%t.o,_Znam, \
58+
; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
59+
; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t2. \
60+
; RUN: -stats -pass-remarks=memprof-context-disambiguation \
61+
; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \
62+
; RUN: --check-prefix=STATS
63+
64+
; RUN: cat %t2.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
65+
;; We should have cloned bar, baz, and foo, for the cold memory allocation.
66+
; RUN: cat %t2.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
67+
68+
;; Check distributed index
69+
; RUN: llvm-dis %t.o.thinlto.bc -o - | FileCheck %s --check-prefix=DISTRIB
70+
4971
source_filename = "memprof-basic.ll"
5072
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
5173
target triple = "x86_64-unknown-linux-gnu"
@@ -227,6 +249,11 @@ uselistorder ptr @_Z3foov, { 1, 0 }
227249
; DUMP: Clone of [[BAR]]
228250

229251

252+
; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
253+
; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
254+
; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis
255+
256+
230257
; DOT: digraph "postbuild" {
231258
; DOT: label="postbuild";
232259
; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> alloc}"];
@@ -258,3 +285,9 @@ uselistorder ptr @_Z3foov, { 1, 0 }
258285
; DOTCLONED: Node[[BAZ2]] -> Node[[BAR2:0x[a-z0-9]+]][tooltip="ContextIds: 2",fillcolor="cyan"];
259286
; DOTCLONED: Node[[BAR2]] [shape=record,tooltip="N[[BAR2]] ContextIds: 2",fillcolor="cyan",style="filled",color="blue",style="filled,bold,dashed",label="{OrigId: Alloc0\n_Z3barv -\> alloc}"];
260287
; DOTCLONED: }
288+
289+
290+
; DISTRIB: ^[[BAZ:[0-9]+]] = gv: (guid: 5878270615442837395, {{.*}} callsites: ((callee: ^[[BAR:[0-9]+]], clones: (0, 1)
291+
; DISTRIB: ^[[FOO:[0-9]+]] = gv: (guid: 6731117468105397038, {{.*}} callsites: ((callee: ^[[BAZ]], clones: (0, 1)
292+
; DISTRIB: ^[[BAR]] = gv: (guid: 9832687305761716512, {{.*}} allocs: ((versions: (notcold, cold)
293+
; DISTRIB: ^[[MAIN:[0-9]+]] = gv: (guid: 15822663052811949562, {{.*}} callsites: ((callee: ^[[FOO]], clones: (0), {{.*}} (callee: ^[[FOO]], clones: (1)

llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll

Lines changed: 44 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
;; Test callsite context graph generation for call graph with MIBs
22
;; that have pruned contexts that partially match multiple inlined
33
;; callsite contexts, requiring duplication of context ids and nodes
4-
;; while matching callsite nodes onto the graph.
4+
;; while matching callsite nodes onto the graph. Also tests graph and IR
5+
;; cloning.
56
;;
67
;; Original code looks like:
78
;;
@@ -60,14 +61,37 @@
6061
; RUN: -r=%t.o,_Znam, \
6162
; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
6263
; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
63-
; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
64+
; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \
65+
; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
66+
; RUN: --check-prefix=STATS
6467

6568
; RUN: cat %t.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE
6669
; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST
6770
;; We should clone D once for the cold allocations via C.
6871
; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
6972

7073

74+
;; Try again but with distributed ThinLTO
75+
; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
76+
; RUN: -thinlto-distributed-indexes \
77+
; RUN: -r=%t.o,main,plx \
78+
; RUN: -r=%t.o,_ZdaPv, \
79+
; RUN: -r=%t.o,sleep, \
80+
; RUN: -r=%t.o,_Znam, \
81+
; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
82+
; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t2. \
83+
; RUN: -stats -pass-remarks=memprof-context-disambiguation \
84+
; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \
85+
; RUN: --check-prefix=STATS
86+
87+
; RUN: cat %t.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE
88+
; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST
89+
;; We should clone D once for the cold allocations via C.
90+
; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
91+
92+
;; Check distributed index
93+
; RUN: llvm-dis %t.o.thinlto.bc -o - | FileCheck %s --check-prefix=DISTRIB
94+
7195
source_filename = "duplicate-context-ids.ll"
7296
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
7397
target triple = "x86_64-unknown-linux-gnu"
@@ -104,7 +128,13 @@ entry:
104128
ret ptr null
105129
}
106130

107-
declare i32 @main()
131+
define i32 @main() {
132+
entry:
133+
call ptr @_Z1Bv()
134+
call ptr @_Z1Ev()
135+
call ptr @_Z1Fv()
136+
ret i32 0
137+
}
108138

109139
declare void @_ZdaPv()
110140

@@ -268,6 +298,11 @@ declare i32 @sleep()
268298
; DUMP: Clone of [[D]]
269299

270300

301+
; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
302+
; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
303+
; STATS: 1 memprof-context-disambiguation - Number of function clones created during whole program analysis
304+
305+
271306
; DOTPRE: digraph "prestackupdate" {
272307
; DOTPRE: label="prestackupdate";
273308
; DOTPRE: Node[[D:0x[a-z0-9]+]] [shape=record,tooltip="N[[D]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z1Dv -\> alloc}"];
@@ -305,3 +340,9 @@ declare i32 @sleep()
305340
; DOTCLONED: Node[[E]] -> Node[[D2]][tooltip="ContextIds: 1",fillcolor="cyan"];
306341
; DOTCLONED: Node[[D2]] [shape=record,tooltip="N[[D2]] ContextIds: 1 3 4",fillcolor="cyan",style="filled",color="blue",style="filled,bold,dashed",label="{OrigId: Alloc0\n_Z1Dv -\> alloc}"];
307342
; DOTCLONED: }
343+
344+
; DISTRIB: ^[[C:[0-9]+]] = gv: (guid: 1643923691937891493, {{.*}} callsites: ((callee: ^[[D:[0-9]+]], clones: (1)
345+
; DISTRIB: ^[[D]] = gv: (guid: 4881081444663423788, {{.*}} allocs: ((versions: (notcold, cold)
346+
; DISTRIB: ^[[B:[0-9]+]] = gv: (guid: 14590037969532473829, {{.*}} callsites: ((callee: ^[[D]], clones: (1)
347+
; DISTRIB: ^[[F:[0-9]+]] = gv: (guid: 17035303613541779335, {{.*}} callsites: ((callee: ^[[D]], clones: (0)
348+
; DISTRIB: ^[[E:[0-9]+]] = gv: (guid: 17820708772846654376, {{.*}} callsites: ((callee: ^[[D]], clones: (1)
Lines changed: 232 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,232 @@
1+
;; Test context disambiguation for a callgraph containing multiple memprof
2+
;; contexts and no inlining, where we need to perform additional cloning
3+
;; during function assignment/cloning to handle the combination of contexts
4+
;; to 2 different allocations.
5+
;;
6+
;; void E(char **buf1, char **buf2) {
7+
;; *buf1 = new char[10];
8+
;; *buf2 = new char[10];
9+
;; }
10+
;;
11+
;; void B(char **buf1, char **buf2) {
12+
;; E(buf1, buf2);
13+
;; }
14+
;;
15+
;; void C(char **buf1, char **buf2) {
16+
;; E(buf1, buf2);
17+
;; }
18+
;;
19+
;; void D(char **buf1, char **buf2) {
20+
;; E(buf1, buf2);
21+
;; }
22+
;; int main(int argc, char **argv) {
23+
;; char *cold1, *cold2, *default1, *default2, *default3, *default4;
24+
;; B(&default1, &default2);
25+
;; C(&default3, &cold1);
26+
;; D(&cold2, &default4);
27+
;; memset(cold1, 0, 10);
28+
;; memset(cold2, 0, 10);
29+
;; memset(default1, 0, 10);
30+
;; memset(default2, 0, 10);
31+
;; memset(default3, 0, 10);
32+
;; memset(default4, 0, 10);
33+
;; delete[] default1;
34+
;; delete[] default2;
35+
;; delete[] default3;
36+
;; delete[] default4;
37+
;; sleep(10);
38+
;; delete[] cold1;
39+
;; delete[] cold2;
40+
;; return 0;
41+
;; }
42+
;;
43+
;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the
44+
;; memory freed after sleep(10) results in cold lifetimes.
45+
;;
46+
;; The IR was then reduced using llvm-reduce with the expected FileCheck input.
47+
48+
49+
; RUN: opt -thinlto-bc %s >%t.o
50+
; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
51+
; RUN: -r=%t.o,main,plx \
52+
; RUN: -r=%t.o,_ZdaPv, \
53+
; RUN: -r=%t.o,sleep, \
54+
; RUN: -r=%t.o,_Znam, \
55+
; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
56+
; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \
57+
; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
58+
; RUN: --check-prefix=STATS
59+
60+
61+
;; Try again but with distributed ThinLTO
62+
; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
63+
; RUN: -thinlto-distributed-indexes \
64+
; RUN: -r=%t.o,main,plx \
65+
; RUN: -r=%t.o,_ZdaPv, \
66+
; RUN: -r=%t.o,sleep, \
67+
; RUN: -r=%t.o,_Znam, \
68+
; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
69+
; RUN: -stats -pass-remarks=memprof-context-disambiguation \
70+
; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \
71+
; RUN: --check-prefix=STATS
72+
73+
74+
source_filename = "funcassigncloning.ll"
75+
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
76+
target triple = "x86_64-unknown-linux-gnu"
77+
78+
; Function Attrs: noinline optnone
79+
define internal void @_Z1EPPcS0_(ptr %buf1, ptr %buf2) {
80+
entry:
81+
%call = call ptr @_Znam(i64 noundef 10), !memprof !0, !callsite !7
82+
%call1 = call ptr @_Znam(i64 noundef 10), !memprof !8, !callsite !15
83+
ret void
84+
}
85+
86+
declare ptr @_Znam(i64)
87+
88+
define internal void @_Z1BPPcS0_() {
89+
entry:
90+
call void @_Z1EPPcS0_(ptr null, ptr null), !callsite !16
91+
ret void
92+
}
93+
94+
define internal void @_Z1CPPcS0_() {
95+
entry:
96+
call void @_Z1EPPcS0_(ptr null, ptr null), !callsite !17
97+
ret void
98+
}
99+
100+
define internal void @_Z1DPPcS0_() {
101+
entry:
102+
call void @_Z1EPPcS0_(ptr null, ptr null), !callsite !18
103+
ret void
104+
}
105+
106+
; Function Attrs: noinline optnone
107+
define i32 @main() {
108+
entry:
109+
call void @_Z1BPPcS0_()
110+
call void @_Z1CPPcS0_()
111+
call void @_Z1DPPcS0_()
112+
ret i32 0
113+
}
114+
115+
declare void @_ZdaPv()
116+
117+
declare i32 @sleep()
118+
119+
; uselistorder directives
120+
uselistorder ptr @_Znam, { 1, 0 }
121+
122+
!0 = !{!1, !3, !5}
123+
!1 = !{!2, !"cold"}
124+
!2 = !{i64 -3461278137325233666, i64 -7799663586031895603}
125+
!3 = !{!4, !"notcold"}
126+
!4 = !{i64 -3461278137325233666, i64 -3483158674395044949}
127+
!5 = !{!6, !"notcold"}
128+
!6 = !{i64 -3461278137325233666, i64 -2441057035866683071}
129+
!7 = !{i64 -3461278137325233666}
130+
!8 = !{!9, !11, !13}
131+
!9 = !{!10, !"notcold"}
132+
!10 = !{i64 -1415475215210681400, i64 -2441057035866683071}
133+
!11 = !{!12, !"cold"}
134+
!12 = !{i64 -1415475215210681400, i64 -3483158674395044949}
135+
!13 = !{!14, !"notcold"}
136+
!14 = !{i64 -1415475215210681400, i64 -7799663586031895603}
137+
!15 = !{i64 -1415475215210681400}
138+
!16 = !{i64 -2441057035866683071}
139+
!17 = !{i64 -3483158674395044949}
140+
!18 = !{i64 -7799663586031895603}
141+
142+
143+
;; Originally we create a single clone of each call to new from E, since each
144+
;; allocates cold memory for a single caller.
145+
146+
; DUMP: CCG after cloning:
147+
; DUMP: Callsite Context Graph:
148+
; DUMP: Node [[ENEW1ORIG:0x[a-z0-9]+]]
149+
; DUMP: Versions: 1 MIB:
150+
; DUMP: AllocType 2 StackIds: 0
151+
; DUMP: AllocType 1 StackIds: 1
152+
; DUMP: AllocType 1 StackIds: 2
153+
; DUMP: (clone 0)
154+
; DUMP: AllocTypes: NotCold
155+
; DUMP: ContextIds: 2 3
156+
; DUMP: CalleeEdges:
157+
; DUMP: CallerEdges:
158+
; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[C:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 2
159+
; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[B:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 3
160+
; DUMP: Clones: [[ENEW1CLONE:0x[a-z0-9]+]]
161+
162+
; DUMP: Node [[D:0x[a-z0-9]+]]
163+
; DUMP: Callee: 10758063066234039248 (_Z1EPPcS0_) Clones: 0 StackIds: 0 (clone 0)
164+
; DUMP: AllocTypes: NotColdCold
165+
; DUMP: ContextIds: 1 6
166+
; DUMP: CalleeEdges:
167+
; DUMP: Edge from Callee [[ENEW1CLONE]] to Caller: [[D]] AllocTypes: Cold ContextIds: 1
168+
; DUMP: Edge from Callee [[ENEW2ORIG:0x[a-z0-9]+]] to Caller: [[D]] AllocTypes: NotCold ContextIds: 6
169+
; DUMP: CallerEdges:
170+
171+
; DUMP: Node [[C]]
172+
; DUMP: Callee: 10758063066234039248 (_Z1EPPcS0_) Clones: 0 StackIds: 1 (clone 0)
173+
; DUMP: AllocTypes: NotColdCold
174+
; DUMP: ContextIds: 2 5
175+
; DUMP: CalleeEdges:
176+
; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[C]] AllocTypes: NotCold ContextIds: 2
177+
; DUMP: Edge from Callee [[ENEW2CLONE:0x[a-z0-9]+]] to Caller: [[C]] AllocTypes: Cold ContextIds: 5
178+
; DUMP: CallerEdges:
179+
180+
; DUMP: Node [[B]]
181+
; DUMP: Callee: 10758063066234039248 (_Z1EPPcS0_) Clones: 0 StackIds: 2 (clone 0)
182+
; DUMP: AllocTypes: NotCold
183+
; DUMP: ContextIds: 3 4
184+
; DUMP: CalleeEdges:
185+
; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 3
186+
; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 4
187+
; DUMP: CallerEdges:
188+
189+
; DUMP: Node [[ENEW2ORIG]]
190+
; DUMP: Versions: 1 MIB:
191+
; DUMP: AllocType 1 StackIds: 2
192+
; DUMP: AllocType 2 StackIds: 1
193+
; DUMP: AllocType 1 StackIds: 0
194+
; DUMP: (clone 0)
195+
; DUMP: AllocTypes: NotCold
196+
; DUMP: ContextIds: 4 6
197+
; DUMP: CalleeEdges:
198+
; DUMP: CallerEdges:
199+
; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 4
200+
; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[D]] AllocTypes: NotCold ContextIds: 6
201+
; DUMP: Clones: [[ENEW2CLONE]]
202+
203+
; DUMP: Node [[ENEW1CLONE]]
204+
; DUMP: Versions: 1 MIB:
205+
; DUMP: AllocType 2 StackIds: 0
206+
; DUMP: AllocType 1 StackIds: 1
207+
; DUMP: AllocType 1 StackIds: 2
208+
; DUMP: (clone 0)
209+
; DUMP: AllocTypes: Cold
210+
; DUMP: ContextIds: 1
211+
; DUMP: CalleeEdges:
212+
; DUMP: CallerEdges:
213+
; DUMP: Edge from Callee [[ENEW1CLONE]] to Caller: [[D]] AllocTypes: Cold ContextIds: 1
214+
; DUMP: Clone of [[ENEW1ORIG]]
215+
216+
; DUMP: Node [[ENEW2CLONE]]
217+
; DUMP: Versions: 1 MIB:
218+
; DUMP: AllocType 1 StackIds: 2
219+
; DUMP: AllocType 2 StackIds: 1
220+
; DUMP: AllocType 1 StackIds: 0
221+
; DUMP: (clone 0)
222+
; DUMP: AllocTypes: Cold
223+
; DUMP: ContextIds: 5
224+
; DUMP: CalleeEdges:
225+
; DUMP: CallerEdges:
226+
; DUMP: Edge from Callee [[ENEW2CLONE]] to Caller: [[C]] AllocTypes: Cold ContextIds: 5
227+
; DUMP: Clone of [[ENEW2ORIG]]
228+
229+
230+
; STATS: 2 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
231+
; STATS: 4 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
232+
; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis

0 commit comments

Comments
 (0)