Skip to content

Commit 3654183

Browse files
[MemProf] Allow promotion if target is a declaration (#115555)
Fixes an oversight in the MemProf ICP handling that was blocking promotion/cloning of indirect calls when the profiled target is a declaration (i.e., it wasn't imported). There is no issue promoting in that case, and in fact the comment mentions we should attempt to at least import as declarations to enable more promotion. Note that normal ICP currently requires that the target be a definition, which is how this check ended up here. The comment there says that it must be a definition because ThinLTO could remove declarations for symbols found to be globally dead in the binary. However, here we are always performing MemProf ICP in the ThinLTO backends, which is after the globally dead symbols are removed (via dropDeadSymbols before starting the optimization pipeline) [1]. For now, guard this with an option (the flag is off by default, which means the new promotion is enabled) to simplify debugging or disabling it if this proves problematic. [1] In fact, we could also be more aggressive in regular ICP when it is invoked in the ThinLTO backend.
1 parent 1d41543 commit 3654183

File tree

2 files changed

+122
-25
lines changed

2 files changed

+122
-25
lines changed

llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,11 @@ cl::opt<bool> EnableMemProfContextDisambiguation(
132132
cl::opt<bool> SupportsHotColdNew(
133133
"supports-hot-cold-new", cl::init(false), cl::Hidden,
134134
cl::desc("Linking with hot/cold operator new interfaces"));
135+
136+
cl::opt<bool> MemProfRequireDefinitionForPromotion(
137+
"memprof-require-definition-for-promotion", cl::init(false), cl::Hidden,
138+
cl::desc(
139+
"Require target function definition when promoting indirect calls"));
135140
} // namespace llvm
136141

137142
extern cl::opt<bool> MemProfReportHintedSizes;
@@ -4602,7 +4607,13 @@ void MemProfContextDisambiguation::performICP(
46024607
// target (or version of the code), and we need to be conservative
46034608
// (similar to what is done in the ICP pass).
46044609
Function *TargetFunction = Symtab->getFunction(Candidate.Value);
4605-
if (TargetFunction == nullptr || TargetFunction->isDeclaration()) {
4610+
if (TargetFunction == nullptr ||
4611+
// Any ThinLTO global dead symbol removal should have already
4612+
// occurred, so it should be safe to promote when the target is a
4613+
// declaration.
4614+
// TODO: Remove internal option once more fully tested.
4615+
(MemProfRequireDefinitionForPromotion &&
4616+
TargetFunction->isDeclaration())) {
46064617
ORE.emit([&]() {
46074618
return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", CB)
46084619
<< "Memprof cannot promote indirect call: target with md5sum "

llvm/test/ThinLTO/X86/memprof-icp.ll

Lines changed: 110 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@
9393
; RUN: -r=%t/foo.o,_Z3fooR2B0j,plx \
9494
; RUN: -r=%t/foo.o,_ZN2B03barEj.abc,plx \
9595
; RUN: -r=%t/foo.o,_Z3xyzR2B0j, \
96+
; RUN: -r=%t/foo.o,_ZN2B03barEj, \
97+
; RUN: -r=%t/foo.o,_ZN1B3barEj, \
9698
; RUN: -r=%t/main.o,_Z3fooR2B0j, \
9799
; RUN: -r=%t/main.o,_Znwm, \
98100
; RUN: -r=%t/main.o,_ZdlPvm, \
@@ -113,9 +115,9 @@
113115
; RUN: -pass-remarks=. -save-temps \
114116
; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=STATS \
115117
; RUN: --check-prefix=STATS-BE --check-prefix=REMARKS-MAIN \
116-
; RUN: --check-prefix=REMARKS-FOO
118+
; RUN: --check-prefix=REMARKS-FOO --check-prefix=REMARKS-FOO-IMPORT
117119

118-
; RUN: llvm-dis %t.out.2.4.opt.bc -o - | FileCheck %s --check-prefix=IR
120+
; RUN: llvm-dis %t.out.2.4.opt.bc -o - | FileCheck %s --check-prefix=IR --check-prefix=IR-IMPORT
119121

120122
;; Try again but with distributed ThinLTO
121123
; RUN: llvm-lto2 run %t/main.o %t/foo.o -enable-memprof-context-disambiguation \
@@ -124,6 +126,8 @@
124126
; RUN: -r=%t/foo.o,_Z3fooR2B0j,plx \
125127
; RUN: -r=%t/foo.o,_ZN2B03barEj.abc,plx \
126128
; RUN: -r=%t/foo.o,_Z3xyzR2B0j, \
129+
; RUN: -r=%t/foo.o,_ZN2B03barEj, \
130+
; RUN: -r=%t/foo.o,_ZN1B3barEj, \
127131
; RUN: -r=%t/main.o,_Z3fooR2B0j, \
128132
; RUN: -r=%t/main.o,_Znwm, \
129133
; RUN: -r=%t/main.o,_ZdlPvm, \
@@ -147,8 +151,9 @@
147151
; RUN: -enable-memprof-indirect-call-support=true \
148152
; RUN: -summary-file=%t/foo.o.thinlto.bc -memprof-import-summary=%t/foo.o.thinlto.bc \
149153
; RUN: -enable-import-metadata -stats -pass-remarks=. \
150-
; RUN: %t/foo.o -S 2>&1 | FileCheck %s --check-prefix=IR \
151-
; RUN: --check-prefix=STATS-BE-DISTRIB --check-prefix=REMARKS-FOO
154+
; RUN: %t/foo.o -S 2>&1 | FileCheck %s --check-prefix=IR --check-prefix=IR-IMPORT \
155+
; RUN: --check-prefix=STATS-BE-DISTRIB --check-prefix=REMARKS-FOO \
156+
; RUN: --check-prefix=REMARKS-FOO-IMPORT
152157

153158
;; Retry with the ICP-disabled object file, and make sure we disable it again
154159
;; so we don't look for the synthesized callsite records when applying imports.
@@ -159,6 +164,8 @@
159164
; RUN: -r=%t/foo.noicp.o,_Z3fooR2B0j,plx \
160165
; RUN: -r=%t/foo.noicp.o,_ZN2B03barEj.abc,plx \
161166
; RUN: -r=%t/foo.noicp.o,_Z3xyzR2B0j, \
167+
; RUN: -r=%t/foo.noicp.o,_ZN2B03barEj, \
168+
; RUN: -r=%t/foo.noicp.o,_ZN1B3barEj, \
162169
; RUN: -r=%t/main.o,_Z3fooR2B0j, \
163170
; RUN: -r=%t/main.o,_Znwm, \
164171
; RUN: -r=%t/main.o,_ZdlPvm, \
@@ -184,6 +191,74 @@
184191
;; metadata.
185192
; RUN: llvm-dis %t.noicp.out.2.4.opt.bc -o - | FileCheck %s --implicit-check-not "_Z3fooR2B0j.memprof" --implicit-check-not "!callsite"
186193

194+
;; Run in-process ThinLTO again, but with importing disabled by setting the
195+
;; instruction limit to 0. Ensure that the existing declarations of B::bar
196+
;; and B0::bar are sufficient to allow for the promotion and cloning.
197+
; RUN: llvm-lto2 run %t/main.o %t/foo.o -enable-memprof-context-disambiguation \
198+
; RUN: -import-instr-limit=0 \
199+
; RUN: -enable-memprof-indirect-call-support=true \
200+
; RUN: -supports-hot-cold-new \
201+
; RUN: -r=%t/foo.o,_Z3fooR2B0j,plx \
202+
; RUN: -r=%t/foo.o,_ZN2B03barEj.abc,plx \
203+
; RUN: -r=%t/foo.o,_Z3xyzR2B0j, \
204+
; RUN: -r=%t/foo.o,_ZN2B03barEj, \
205+
; RUN: -r=%t/foo.o,_ZN1B3barEj, \
206+
; RUN: -r=%t/main.o,_Z3fooR2B0j, \
207+
; RUN: -r=%t/main.o,_Znwm, \
208+
; RUN: -r=%t/main.o,_ZdlPvm, \
209+
; RUN: -r=%t/main.o,_Z8externalPi, \
210+
; RUN: -r=%t/main.o,main,plx \
211+
; RUN: -r=%t/main.o,_ZN2B03barEj,plx \
212+
; RUN: -r=%t/main.o,_ZN1B3barEj,plx \
213+
; RUN: -r=%t/main.o,_ZTV1B,plx \
214+
; RUN: -r=%t/main.o,_ZTVN10__cxxabiv120__si_class_type_infoE,plx \
215+
; RUN: -r=%t/main.o,_ZTS1B,plx \
216+
; RUN: -r=%t/main.o,_ZTVN10__cxxabiv117__class_type_infoE,plx \
217+
; RUN: -r=%t/main.o,_ZTS2B0,plx \
218+
; RUN: -r=%t/main.o,_ZTI2B0,plx \
219+
; RUN: -r=%t/main.o,_ZTI1B,plx \
220+
; RUN: -r=%t/main.o,_ZTV2B0,plx \
221+
; RUN: -thinlto-threads=1 \
222+
; RUN: -memprof-verify-ccg -memprof-verify-nodes -stats \
223+
; RUN: -pass-remarks=. -save-temps \
224+
; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=STATS \
225+
; RUN: --check-prefix=STATS-BE-NOIMPORT --check-prefix=REMARKS-MAIN \
226+
; RUN: --check-prefix=REMARKS-FOO
227+
228+
; RUN: llvm-dis %t.out.2.4.opt.bc -o - | FileCheck %s --check-prefix=IR --check-prefix=IR-NOIMPORT
229+
230+
;; Run it again but with -memprof-require-definition-for-promotion, and confirm
231+
;; that no promotions occur.
232+
; RUN: llvm-lto2 run %t/main.o %t/foo.o -enable-memprof-context-disambiguation \
233+
; RUN: -import-instr-limit=0 \
234+
; RUN: -memprof-require-definition-for-promotion \
235+
; RUN: -enable-memprof-indirect-call-support=true \
236+
; RUN: -supports-hot-cold-new \
237+
; RUN: -r=%t/foo.o,_Z3fooR2B0j,plx \
238+
; RUN: -r=%t/foo.o,_ZN2B03barEj.abc,plx \
239+
; RUN: -r=%t/foo.o,_Z3xyzR2B0j, \
240+
; RUN: -r=%t/foo.o,_ZN2B03barEj, \
241+
; RUN: -r=%t/foo.o,_ZN1B3barEj, \
242+
; RUN: -r=%t/main.o,_Z3fooR2B0j, \
243+
; RUN: -r=%t/main.o,_Znwm, \
244+
; RUN: -r=%t/main.o,_ZdlPvm, \
245+
; RUN: -r=%t/main.o,_Z8externalPi, \
246+
; RUN: -r=%t/main.o,main,plx \
247+
; RUN: -r=%t/main.o,_ZN2B03barEj,plx \
248+
; RUN: -r=%t/main.o,_ZN1B3barEj,plx \
249+
; RUN: -r=%t/main.o,_ZTV1B,plx \
250+
; RUN: -r=%t/main.o,_ZTVN10__cxxabiv120__si_class_type_infoE,plx \
251+
; RUN: -r=%t/main.o,_ZTS1B,plx \
252+
; RUN: -r=%t/main.o,_ZTVN10__cxxabiv117__class_type_infoE,plx \
253+
; RUN: -r=%t/main.o,_ZTS2B0,plx \
254+
; RUN: -r=%t/main.o,_ZTI2B0,plx \
255+
; RUN: -r=%t/main.o,_ZTI1B,plx \
256+
; RUN: -r=%t/main.o,_ZTV2B0,plx \
257+
; RUN: -thinlto-threads=1 \
258+
; RUN: -memprof-verify-ccg -memprof-verify-nodes \
259+
; RUN: -pass-remarks=. \
260+
; RUN: -o %t.out 2>&1 | FileCheck %s --implicit-check-not Promote
261+
187262
; REMARKS-MAIN: call in clone main assigned to call function clone _Z3fooR2B0j.memprof.1
188263
; REMARKS-MAIN: call in clone main assigned to call function clone _Z3fooR2B0j.memprof.1
189264
; REMARKS-MAIN: created clone _ZN2B03barEj.memprof.1
@@ -208,51 +283,59 @@
208283
; REMARKS-FOO: call in clone _Z3fooR2B0j promoted and assigned to call function clone _ZN2B03barEj
209284
; REMARKS-FOO: Promote indirect call to _ZN2B03barEj with count 2 out of 2
210285
; REMARKS-FOO: call in clone _Z3fooR2B0j.memprof.1 promoted and assigned to call function clone _ZN2B03barEj.memprof.1
211-
; REMARKS-FOO: created clone _ZN2B03barEj.memprof.1
212-
; REMARKS-FOO: call in clone _ZN2B03barEj marked with memprof allocation attribute notcold
213-
; REMARKS-FOO: call in clone _ZN2B03barEj.memprof.1 marked with memprof allocation attribute cold
214-
; REMARKS-FOO: created clone _ZN1B3barEj.memprof.1
215-
; REMARKS-FOO: call in clone _ZN1B3barEj marked with memprof allocation attribute notcold
216-
; REMARKS-FOO: call in clone _ZN1B3barEj.memprof.1 marked with memprof allocation attribute cold
286+
; REMARKS-FOO-IMPORT: created clone _ZN2B03barEj.memprof.1
287+
; REMARKS-FOO-IMPORT: call in clone _ZN2B03barEj marked with memprof allocation attribute notcold
288+
; REMARKS-FOO-IMPORT: call in clone _ZN2B03barEj.memprof.1 marked with memprof allocation attribute cold
289+
; REMARKS-FOO-IMPORT: created clone _ZN1B3barEj.memprof.1
290+
; REMARKS-FOO-IMPORT: call in clone _ZN1B3barEj marked with memprof allocation attribute notcold
291+
; REMARKS-FOO-IMPORT: call in clone _ZN1B3barEj.memprof.1 marked with memprof allocation attribute cold
217292

218293
; STATS: 4 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during whole program analysis
219294
; STATS-BE: 8 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend
295+
; STATS-BE-NOIMPORT: 4 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend
220296
; STATS: 4 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during whole program analysis
221297
; STATS-BE: 8 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend
298+
; STATS-BE-NOIMPORT: 4 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend
222299
; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis
223300
; STATS-BE: 5 memprof-context-disambiguation - Number of function clones created during ThinLTO backend
301+
; STATS-BE-NOIMPORT: 3 memprof-context-disambiguation - Number of function clones created during ThinLTO backend
224302

303+
; IR-NOIMPORT: foo
225304
; IR: define {{.*}} @_Z3fooR2B0j(
226-
; IR: %1 = icmp eq ptr %0, @_ZN1B3barEj
227-
; IR: br i1 %1, label %if.true.direct_targ, label %if.false.orig_indirect
305+
; IR: %[[R1:[0-9]+]] = icmp eq ptr %0, @_ZN1B3barEj
306+
; IR: br i1 %[[R1]], label %if.true.direct_targ, label %if.false.orig_indirect
228307
; IR: if.true.direct_targ:
229-
; IR: call {{.*}} @_Znwm(i64 noundef 4) #[[NOTCOLD:[0-9]+]]
308+
; IR-IMPORT: call {{.*}} @_Znwm(i64 noundef 4) #[[NOTCOLD:[0-9]+]]
309+
; IR-NOIMPORT: call {{.*}} @_ZN1B3barEj(
230310
; IR: if.false.orig_indirect:
231-
; IR: %2 = icmp eq ptr %0, @_ZN2B03barEj
232-
; IR: br i1 %2, label %if.true.direct_targ1, label %if.false.orig_indirect2
311+
; IR: %[[R2:[0-9]+]] = icmp eq ptr %0, @_ZN2B03barEj
312+
; IR: br i1 %[[R2]], label %if.true.direct_targ1, label %if.false.orig_indirect2
233313
; IR: if.true.direct_targ1:
234-
; IR: call {{.*}} @_Znwm(i64 noundef 4) #[[NOTCOLD]]
314+
; IR-IMPORT: call {{.*}} @_Znwm(i64 noundef 4) #[[NOTCOLD]]
315+
; IR-NOIMPORT: call {{.*}} @_ZN2B03barEj(
235316
; IR: if.false.orig_indirect2:
236317
; IR: call {{.*}} %0
237318

238319
; IR: define {{.*}} @_Z3fooR2B0j.memprof.1(
239320
;; We should still compare against the original versions of bar since that is
240321
;; what is in the vtable. However, we should have called the cloned versions
241322
;; that perform cold allocations, which were subsequently inlined.
242-
; IR: %1 = icmp eq ptr %0, @_ZN1B3barEj
243-
; IR: br i1 %1, label %if.true.direct_targ, label %if.false.orig_indirect
323+
; IR: %[[R3:[0-9]+]] = icmp eq ptr %0, @_ZN1B3barEj
324+
; IR: br i1 %[[R3]], label %if.true.direct_targ, label %if.false.orig_indirect
244325
; IR: if.true.direct_targ:
245-
; IR: call {{.*}} @_Znwm(i64 noundef 4) #[[COLD:[0-9]+]]
326+
; IR-IMPORT: call {{.*}} @_Znwm(i64 noundef 4) #[[COLD:[0-9]+]]
327+
; IR-NOIMPORT: call {{.*}} @_ZN1B3barEj.memprof.1(
246328
; IR: if.false.orig_indirect:
247-
; IR: %2 = icmp eq ptr %0, @_ZN2B03barEj
248-
; IR: br i1 %2, label %if.true.direct_targ1, label %if.false.orig_indirect2
329+
; IR: %[[R4:[0-9]+]] = icmp eq ptr %0, @_ZN2B03barEj
330+
; IR: br i1 %[[R4]], label %if.true.direct_targ1, label %if.false.orig_indirect2
249331
; IR: if.true.direct_targ1:
250-
; IR: call {{.*}} @_Znwm(i64 noundef 4) #[[COLD]]
332+
; IR-IMPORT: call {{.*}} @_Znwm(i64 noundef 4) #[[COLD]]
333+
; IR-NOIMPORT: call {{.*}} @_ZN2B03barEj.memprof.1(
251334
; IR: if.false.orig_indirect2:
252335
; IR: call {{.*}} %0
253336

254-
; IR: attributes #[[NOTCOLD]] = {{.*}} "memprof"="notcold"
255-
; IR: attributes #[[COLD]] = {{.*}} "memprof"="cold"
337+
; IR-IMPORT: attributes #[[NOTCOLD]] = {{.*}} "memprof"="notcold"
338+
; IR-IMPORT: attributes #[[COLD]] = {{.*}} "memprof"="cold"
256339

257340
; STATS-BE-DISTRIB: 4 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend
258341
; STATS-BE-DISTRIB: 4 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend
@@ -272,6 +355,9 @@ define i32 @_ZN2B03barEj.abc(ptr %this, i32 %s) {
272355
ret i32 0
273356
}
274357

358+
declare i32 @_ZN2B03barEj(ptr %this, i32 %s)
359+
declare i32 @_ZN1B3barEj(ptr %this, i32 %s)
360+
275361
define i32 @_Z3fooR2B0j(ptr %b) {
276362
entry:
277363
%0 = load ptr, ptr %b, align 8

0 commit comments

Comments
 (0)