Skip to content

Commit 4ca6e37

Browse files
committed
[CSSPGO] Overwrite branch weight annotated in previous pass.
Sample profile loader can be run in both LTO prelink and postlink. Currently the counts annotation in postlink doesn't fully overwrite what's done in prelink. I'm adding a switch (`-overwrite-existing-weights=1`) to enable a full overwrite, which includes: 1. Clear old metadata for calls when their parent block has a zero count. This could be caused by prelink code duplication. 2. Clear indirect call metadata if somehow all the remaining targets have a sum of zero count. 3. Overwrite branch weight for basic blocks. With a CS profile, I was seeing #1 and #2 help reduce code size by preventing post-sample ICP and CGSCC inliner working on obsolete metadata, which comes from a partial global inlining in prelink. It's not expected to work well for non-CS case with a less-accurate post-inline count quality. It's worth calling out that some prelink optimizations can damage counts quality in an irreversible way. One example is the loop rotate optimization. Due to lack of exact loop entry count (profiling can only give loop iteration count and loop exit count), moving one iteration out of the loop body leaves the remaining iteration count unknown. We had to turn off prelink loop rotate to achieve a better postlink counts quality. An even better postlink counts quality can be achieved by turning off prelink CGSCC inlining which is not context-sensitive. Reviewed By: wenlei, wmi Differential Revision: https://reviews.llvm.org/D102537
1 parent 5178574 commit 4ca6e37

File tree

3 files changed

+108
-13
lines changed

3 files changed

+108
-13
lines changed

llvm/lib/Transforms/IPO/SampleProfile.cpp

Lines changed: 38 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,6 @@ static cl::opt<bool> CallsitePrioritizedInline(
208208
cl::desc("Use call site prioritized inlining for sample profile loader."
209209
"Currently only CSSPGO is supported."));
210210

211-
212211
static cl::opt<std::string> ProfileInlineReplayFile(
213212
"sample-profile-inline-replay", cl::init(""), cl::value_desc("filename"),
214213
cl::desc(
@@ -222,6 +221,10 @@ static cl::opt<unsigned>
222221
cl::desc("Max number of promotions for a single indirect "
223222
"call callsite in sample profile loader"));
224223

224+
static cl::opt<bool> OverwriteExistingWeights(
225+
"overwrite-existing-weights", cl::Hidden, cl::init(false),
226+
cl::desc("Ignore existing branch weights on IR and always overwrite."));
227+
225228
namespace {
226229

227230
using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>;
@@ -1453,9 +1456,10 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) {
14531456
auto T = FS->findCallTargetMapAt(CallSite);
14541457
if (!T || T.get().empty())
14551458
continue;
1456-
// Prorate the callsite counts to reflect what is already done to the
1457-
// callsite, such as ICP or calliste cloning.
14581459
if (FunctionSamples::ProfileIsProbeBased) {
1460+
// Prorate the callsite counts based on the pre-ICP distribution
1461+
// factor to reflect what is already done to the callsite before
1462+
// ICP, such as callsite cloning.
14591463
if (Optional<PseudoProbe> Probe = extractProbe(I)) {
14601464
if (Probe->Factor < 1)
14611465
T = SampleRecord::adjustCallTargets(T.get(), Probe->Factor);
@@ -1476,16 +1480,29 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) {
14761480
Sum += NameFS.second.getEntrySamples();
14771481
}
14781482
}
1479-
if (!Sum)
1480-
continue;
1481-
updateIDTMetaData(I, SortedCallTargets, Sum);
1483+
if (Sum)
1484+
updateIDTMetaData(I, SortedCallTargets, Sum);
1485+
else if (OverwriteExistingWeights)
1486+
I.setMetadata(LLVMContext::MD_prof, nullptr);
14821487
} else if (!isa<IntrinsicInst>(&I)) {
14831488
I.setMetadata(LLVMContext::MD_prof,
14841489
MDB.createBranchWeights(
14851490
{static_cast<uint32_t>(BlockWeights[BB])}));
14861491
}
14871492
}
1493+
} else if (OverwriteExistingWeights) {
1494+
// Set profile metadata (possibly annotated by LTO prelink) to zero or
1495+
// clear it for cold code.
1496+
for (auto &I : BB->getInstList()) {
1497+
if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1498+
if (cast<CallBase>(I).isIndirectCall())
1499+
I.setMetadata(LLVMContext::MD_prof, nullptr);
1500+
else
1501+
I.setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(0));
1502+
}
1503+
}
14881504
}
1505+
14891506
Instruction *TI = BB->getTerminator();
14901507
if (TI->getNumSuccessors() == 1)
14911508
continue;
@@ -1527,20 +1544,28 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) {
15271544
uint64_t TempWeight;
15281545
// Only set weights if there is at least one non-zero weight.
15291546
// In any other case, let the analyzer set weights.
1530-
// Do not set weights if the weights are present. In ThinLTO, the profile
1531-
// annotation is done twice. If the first annotation already set the
1532-
// weights, the second pass does not need to set it.
1533-
if (MaxWeight > 0 && !TI->extractProfTotalWeight(TempWeight)) {
1547+
// Do not set weights if the weights are present unless under
1548+
// OverwriteExistingWeights. In ThinLTO, the profile annotation is done
1549+
// twice. If the first annotation already set the weights, the second pass
1550+
// does not need to set it. With OverwriteExistingWeights, blocks with zero
1551+
// weight should have their existing metadata (possibly annotated by LTO
1552+
// prelink) cleared.
1553+
if (MaxWeight > 0 &&
1554+
(!TI->extractProfTotalWeight(TempWeight) || OverwriteExistingWeights)) {
15341555
LLVM_DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n");
1535-
TI->setMetadata(LLVMContext::MD_prof,
1536-
MDB.createBranchWeights(Weights));
1556+
TI->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
15371557
ORE->emit([&]() {
15381558
return OptimizationRemark(DEBUG_TYPE, "PopularDest", MaxDestInst)
15391559
<< "most popular destination for conditional branches at "
15401560
<< ore::NV("CondBranchesLoc", BranchLoc);
15411561
});
15421562
} else {
1543-
LLVM_DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n");
1563+
if (OverwriteExistingWeights) {
1564+
TI->setMetadata(LLVMContext::MD_prof, nullptr);
1565+
LLVM_DEBUG(dbgs() << "CLEARED. All branch weights are zero.\n");
1566+
} else {
1567+
LLVM_DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n");
1568+
}
15441569
}
15451570
}
15461571
}

llvm/test/Transforms/SampleProfile/branch.ll

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/branch.prof | opt -analyze -branch-prob -enable-new-pm=0 | FileCheck %s
22
; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/branch.prof | opt -passes='print<branch-prob>' -disable-output 2>&1 | FileCheck %s
3+
; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/branch.prof -overwrite-existing-weights=1 | opt -passes='print<branch-prob>' -disable-output 2>&1 | FileCheck %s --check-prefix=OVW
34

45
; Original C++ code for this test case:
56
;
@@ -90,6 +91,8 @@ for.cond: ; preds = %for.inc, %if.then.2
9091
br i1 %cmp5, label %for.body, label %for.end, !dbg !50, !prof !80
9192
; CHECK: edge for.cond -> for.body probability is 0x73333333 / 0x80000000 = 90.00%
9293
; CHECK: edge for.cond -> for.end probability is 0x0ccccccd / 0x80000000 = 10.00%
94+
; OVW: edge for.cond -> for.body probability is 0x76b3f3be / 0x80000000 = 92.74%
95+
; OVW: edge for.cond -> for.end probability is 0x094c0c42 / 0x80000000 = 7.26%
9396

9497
for.body: ; preds = %for.cond
9598
call void @llvm.dbg.declare(metadata double* %x, metadata !51, metadata !17), !dbg !53
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-profile.prof -pass-remarks=sample-profile -S | FileCheck %s
2+
; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-profile.prof -pass-remarks=sample-profile -overwrite-existing-weights=1 -S | FileCheck %s -check-prefix=OVW
3+
4+
define dso_local i32 @foo(i32 %x, void (i32)* %f) #0 !dbg !4 !prof !10 {
5+
entry:
6+
%retval = alloca i32, align 4
7+
%x.addr = alloca i32, align 4
8+
store i32 %x, i32* %x.addr, align 4
9+
%0 = load i32, i32* %x.addr, align 4
10+
%cmp = icmp eq i32 %0, 0
11+
call void @llvm.pseudoprobe(i64 6699318081062747564, i64 1, i32 0, i64 -1)
12+
br i1 %cmp, label %if.then, label %if.else, !prof !11
13+
14+
if.then:
15+
call void @llvm.pseudoprobe(i64 6699318081062747564, i64 2, i32 0, i64 -1)
16+
; CHECK: call {{.*}}, !dbg ![[#]], !prof ![[#PROF:]]
17+
; OVW: call {{.*}}, !dbg ![[#]], !prof ![[#PROF:]]
18+
call void %f(i32 1), !dbg !13, !prof !16
19+
store i32 1, i32* %retval, align 4
20+
br label %return
21+
22+
if.else:
23+
call void @llvm.pseudoprobe(i64 6699318081062747564, i64 2, i32 0, i64 0)
24+
; CHECK: call {{.*}}, !dbg ![[#]], !prof ![[#PROF]]
25+
;; The block should have a 0 weight. Check the profile metadata is dropped.
26+
; OVW-NOT: call {{.*}}, !dbg ![[#]], !prof
27+
call void %f(i32 2), !dbg !15, !prof !16
28+
store i32 2, i32* %retval, align 4
29+
br label %return
30+
31+
return:
32+
call void @llvm.pseudoprobe(i64 6699318081062747564, i64 4, i32 0, i64 -1)
33+
%1 = load i32, i32* %retval, align 4
34+
ret i32 %1
35+
}
36+
37+
; CHECK: ![[#PROF]] = !{!"VP", i32 0, i64 7, i64 9191153033785521275, i64 5, i64 -1069303473483922844, i64 2}
38+
; OVW: ![[#PROF]] = !{!"VP", i32 0, i64 7, i64 9191153033785521275, i64 5, i64 -1069303473483922844, i64 2}
39+
40+
declare void @llvm.pseudoprobe(i64, i64, i32, i64) #0
41+
42+
attributes #0 = {"use-sample-profile"}
43+
44+
!llvm.module.flags = !{!0, !1}
45+
!llvm.pseudo_probe_desc = !{!2}
46+
47+
!0 = !{i32 7, !"Dwarf Version", i32 4}
48+
!1 = !{i32 2, !"Debug Info Version", i32 3}
49+
!2 = !{i64 6699318081062747564, i64 563022570642068, !"foo", null}
50+
!4 = distinct !DISubprogram(name: "foo", scope: !5, file: !5, line: 9, type: !6, scopeLine: 9, spFlags: DISPFlagDefinition, unit: !9)
51+
!5 = !DIFile(filename: "test.cpp", directory: "test")
52+
!6 = !DISubroutineType(types: !7)
53+
!7 = !{!8, !8}
54+
!8 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
55+
!9 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !5, isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug)
56+
!10 = !{!"function_entry_count", i64 14}
57+
!11 = !{!"branch_weights", i32 100, i32 0}
58+
;; A discriminator of 186646575 which is 0xb20002f in hexadecimal, stands for an indirect call probe
59+
;; with an index of 5 and probe factor of 1.0.
60+
!12 = !DILexicalBlockFile(scope: !4, file: !5, discriminator: 186646575)
61+
!13 = distinct !DILocation(line: 10, column: 11, scope: !12)
62+
;; A discriminator of 134217775 which is 0x800002f in hexadecimal, stands for an indirect call probe
63+
;; with an index of 5 and probe factor of 0.
64+
!14 = !DILexicalBlockFile(scope: !4, file: !5, discriminator: 134217775)
65+
!15 = distinct !DILocation(line: 10, column: 11, scope: !14)
66+
!16 = !{!"VP", i32 0, i64 7, i64 9191153033785521275, i64 5, i64 -1069303473483922844, i64 2}
67+

0 commit comments

Comments
 (0)