Skip to content

Commit 93b74f7

Browse files
authored
[ctxprof] Scale up everything under a root by its TotalRootEntryCount (#136015)
`TotalRootEntryCount` captures how many times that root was entered, regardless of whether a profile was also collected or not (profile collection for a given root happens on only one thread at a time). We don't do this scaling in compiler_rt because the goal there is to flush out the data as fast as possible, so traversing and multiplying the counter vectors is punted to the profile user. We really just need to apply the scaling when flattening the profile, so that the values across roots and flat profiles match. We could do it earlier, too - like when loading the profile - but it seems beneficial (at least for debugging) to keep the counter values the same as the loaded ones. We can revisit this later.
1 parent 52a5332 commit 93b74f7

File tree

8 files changed

+66
-51
lines changed

8 files changed

+66
-51
lines changed

llvm/lib/Analysis/CtxProfAnalysis.cpp

Lines changed: 36 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -621,17 +621,23 @@ CtxProfAnalysis::getSelectInstrumentation(SelectInst &SI) {
621621
return nullptr;
622622
}
623623

624-
template <class ProfilesTy, class ProfTy>
625-
static void preorderVisit(ProfilesTy &Profiles,
626-
function_ref<void(ProfTy &)> Visitor) {
624+
template <class ProfTy>
625+
static void preorderVisitOneRoot(ProfTy &Profile,
626+
function_ref<void(ProfTy &)> Visitor) {
627627
std::function<void(ProfTy &)> Traverser = [&](auto &Ctx) {
628628
Visitor(Ctx);
629629
for (auto &[_, SubCtxSet] : Ctx.callsites())
630630
for (auto &[__, Subctx] : SubCtxSet)
631631
Traverser(Subctx);
632632
};
633+
Traverser(Profile);
634+
}
635+
636+
template <class ProfilesTy, class ProfTy>
637+
static void preorderVisit(ProfilesTy &Profiles,
638+
function_ref<void(ProfTy &)> Visitor) {
633639
for (auto &[_, P] : Profiles)
634-
Traverser(P);
640+
preorderVisitOneRoot<ProfTy>(P, Visitor);
635641
}
636642

637643
void PGOContextualProfile::initIndex() {
@@ -683,40 +689,47 @@ void PGOContextualProfile::visit(ConstVisitor V, const Function *F) const {
683689
const CtxProfFlatProfile PGOContextualProfile::flatten() const {
684690
CtxProfFlatProfile Flat;
685691
auto Accummulate = [](SmallVectorImpl<uint64_t> &Into,
686-
const SmallVectorImpl<uint64_t> &From) {
692+
const SmallVectorImpl<uint64_t> &From,
693+
uint64_t SamplingRate) {
687694
if (Into.empty())
688695
Into.resize(From.size());
689696
assert(Into.size() == From.size() &&
690697
"All contexts corresponding to a function should have the exact "
691698
"same number of counters.");
692699
for (size_t I = 0, E = Into.size(); I < E; ++I)
693-
Into[I] += From[I];
700+
Into[I] += From[I] * SamplingRate;
694701
};
695702

696-
preorderVisit<const PGOCtxProfContext::CallTargetMapTy,
697-
const PGOCtxProfContext>(
698-
Profiles.Contexts, [&](const PGOCtxProfContext &Ctx) {
699-
Accummulate(Flat[Ctx.guid()], Ctx.counters());
700-
});
701-
for (const auto &[_, RC] : Profiles.Contexts)
702-
for (const auto &[G, Unh] : RC.getUnhandled())
703-
Accummulate(Flat[G], Unh);
703+
for (const auto &[_, CtxRoot] : Profiles.Contexts) {
704+
const uint64_t SamplingFactor = CtxRoot.getTotalRootEntryCount();
705+
preorderVisitOneRoot<const PGOCtxProfContext>(
706+
CtxRoot, [&](const PGOCtxProfContext &Ctx) {
707+
Accummulate(Flat[Ctx.guid()], Ctx.counters(), SamplingFactor);
708+
});
709+
710+
for (const auto &[G, Unh] : CtxRoot.getUnhandled())
711+
Accummulate(Flat[G], Unh, SamplingFactor);
712+
}
713+
// We don't sample "Flat" currently, so sampling rate is 1.
704714
for (const auto &[G, FC] : Profiles.FlatProfiles)
705-
Accummulate(Flat[G], FC);
715+
Accummulate(Flat[G], FC, /*SamplingRate=*/1);
706716
return Flat;
707717
}
708718

709719
const CtxProfFlatIndirectCallProfile
710720
PGOContextualProfile::flattenVirtCalls() const {
711721
CtxProfFlatIndirectCallProfile Ret;
712-
preorderVisit<const PGOCtxProfContext::CallTargetMapTy,
713-
const PGOCtxProfContext>(
714-
Profiles.Contexts, [&](const PGOCtxProfContext &Ctx) {
715-
auto &Targets = Ret[Ctx.guid()];
716-
for (const auto &[ID, SubctxSet] : Ctx.callsites())
717-
for (const auto &Subctx : SubctxSet)
718-
Targets[ID][Subctx.first] += Subctx.second.getEntrycount();
719-
});
722+
for (const auto &[_, CtxRoot] : Profiles.Contexts) {
723+
const uint64_t TotalRootEntryCount = CtxRoot.getTotalRootEntryCount();
724+
preorderVisitOneRoot<const PGOCtxProfContext>(
725+
CtxRoot, [&](const PGOCtxProfContext &Ctx) {
726+
auto &Targets = Ret[Ctx.guid()];
727+
for (const auto &[ID, SubctxSet] : Ctx.callsites())
728+
for (const auto &Subctx : SubctxSet)
729+
Targets[ID][Subctx.first] +=
730+
Subctx.second.getEntrycount() * TotalRootEntryCount;
731+
});
732+
}
720733
return Ret;
721734
}
722735

llvm/test/Analysis/CtxProfAnalysis/flatten-and-annotate.ll

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,13 @@
2323
; PRELINK-LABEL: yes:
2424
; PRELINK-NEXT: call void @llvm.instrprof.increment(ptr @an_entrypoint, i64 [[#]], i32 2, i32 1)
2525
; PRELINK: ![[#]] = !{i32 1, !"ProfileSummary", !1}
26-
; PRELINK: ![[#]] = !{!"TotalCount", i64 3595}
27-
; PRELINK: ![[#]] = !{!"MaxCount", i64 3000}
28-
; PRELINK: ![[#]] = !{!"MaxInternalCount", i64 3000}
29-
; PRELINK: ![[#]] = !{!"MaxFunctionCount", i64 300}
26+
; PRELINK: ![[#]] = !{!"TotalCount", i64 151600}
27+
; PRELINK: ![[#]] = !{!"MaxCount", i64 102000}
28+
; PRELINK: ![[#]] = !{!"MaxInternalCount", i64 102000}
29+
; PRELINK: ![[#]] = !{!"MaxFunctionCount", i64 20100}
3030
; PRELINK: ![[#]] = !{!"NumCounts", i64 6}
3131
; PRELINK: ![[#]] = !{!"NumFunctions", i64 3}
32-
; PRELINK: ![[PREPROF]] = !{!"branch_weights", i32 40, i32 60}
32+
; PRELINK: ![[PREPROF]] = !{!"branch_weights", i32 4000, i32 6000}
3333

3434
; Check that the output has:
3535
; - no instrumentation
@@ -49,25 +49,25 @@
4949
; The postlink summary is restricted to the stuff under the root - including the
5050
; "unhandled" data.
5151
; POSTLINK: ![[#]] = !{i32 1, !"ProfileSummary", !1}
52-
; POSTLINK: ![[#]] = !{!"TotalCount", i64 1495}
53-
; POSTLINK: ![[#]] = !{!"MaxCount", i64 1000}
54-
; POSTLINK: ![[#]] = !{!"MaxInternalCount", i64 1000}
55-
; POSTLINK: ![[#]] = !{!"MaxFunctionCount", i64 200}
52+
; POSTLINK: ![[#]] = !{!"TotalCount", i64 149500}
53+
; POSTLINK: ![[#]] = !{!"MaxCount", i64 100000}
54+
; POSTLINK: ![[#]] = !{!"MaxInternalCount", i64 100000}
55+
; POSTLINK: ![[#]] = !{!"MaxFunctionCount", i64 20000}
5656
; POSTLINK: ![[#]] = !{!"NumCounts", i64 6}
5757
; POSTLINK: ![[#]] = !{!"NumFunctions", i64 3}
5858

5959
;
6060
; @foo will be called both unconditionally and conditionally, on the "yes" branch
6161
; which has a count of 40. So 140 times.
6262

63-
; POSTLINK: ![[FOO_EP]] = !{!"function_entry_count", i64 140}
63+
; POSTLINK: ![[FOO_EP]] = !{!"function_entry_count", i64 14000}
6464

6565
; foo's "no" branch is taken 10+5 times (from the 2 contexts belonging to foo).
6666
; Which means its "yes" branch is taken 140 - 15 times.
6767

68-
; POSTLINK: ![[FOO_BW]] = !{!"branch_weights", i32 125, i32 15}
69-
; POSTLINK: ![[AN_ENTRYPOINT_EP]] = !{!"function_entry_count", i64 100}
70-
; POSTLINK: ![[AN_ENTRYPOINT_BW]] = !{!"branch_weights", i32 40, i32 60}
68+
; POSTLINK: ![[FOO_BW]] = !{!"branch_weights", i32 12500, i32 1500}
69+
; POSTLINK: ![[AN_ENTRYPOINT_EP]] = !{!"function_entry_count", i64 10000}
70+
; POSTLINK: ![[AN_ENTRYPOINT_BW]] = !{!"branch_weights", i32 4000, i32 6000}
7171

7272
;--- profile.yaml
7373
Contexts:

llvm/test/Analysis/CtxProfAnalysis/flatten-check-path.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@
1313

1414
; CHECK: br i1 %x, label %b1, label %exit, !prof ![[PROF1:[0-9]+]]
1515
; CHECK: br i1 %y, label %blk, label %exit, !prof ![[PROF2:[0-9]+]]
16-
; CHECK: ![[PROF1]] = !{!"branch_weights", i32 1, i32 1}
17-
; CHECK: ![[PROF2]] = !{!"branch_weights", i32 0, i32 1}
16+
; CHECK: ![[PROF1]] = !{!"branch_weights", i32 2, i32 2}
17+
; CHECK: ![[PROF2]] = !{!"branch_weights", i32 0, i32 2}
1818
; ASSERTION: Assertion `allTakenPathsExit()
1919

2020
; b1->exit is the only way out from b1, but the exit block would have been

llvm/test/Analysis/CtxProfAnalysis/flatten-insert-icp-mdprof.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
; PRELINK-NEXT: call void %p(), !prof ![[VPPROF:[0-9]+]]
1212
; PRELINK-NEXT: call void @llvm.instrprof.callsite(ptr @foo, i64 1234, i32 2, i32 1, ptr @bar)
1313
; PRELINK-NEXT: call void @bar(){{$}}
14-
; PRELINK: ![[VPPROF]] = !{!"VP", i32 0, i64 5, i64 5678, i64 4, i64 5555, i64 1}
14+
; PRELINK: ![[VPPROF]] = !{!"VP", i32 0, i64 25, i64 5678, i64 20, i64 5555, i64 5}
1515

1616
; RUN: cp %t/example.ll %t/1234.ll
1717
; RUN: opt -passes=ctx-prof-flatten %t/1234.ll -use-ctx-profile=%t/profile.ctxprofdata \

llvm/test/Analysis/CtxProfAnalysis/flatten-zero-path.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
; CHECK-LABEL: yes:
1515
; CHECK: br i1 %t3, label %yes1, label %yes2, !prof ![[C1]]
1616
; CHECK-NOT: !prof
17-
; CHECK: ![[C1]] = !{!"branch_weights", i32 6, i32 0}
17+
; CHECK: ![[C1]] = !{!"branch_weights", i32 72, i32 0}
1818

1919
;--- 1234.ll
2020
define void @f1(i32 %cond) !guid !0 {

llvm/test/Analysis/CtxProfAnalysis/full-cycle.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ Contexts:
108108
Counters: [ 1, 2 ]
109109

110110
Flat Profile:
111-
2072045998141807037 : 7
112-
3087265239403591524 : 11 9
113-
4197650231481825559 : 2
114-
10507721908651011566 : 1
111+
2072045998141807037 : 70
112+
3087265239403591524 : 110 90
113+
4197650231481825559 : 20
114+
10507721908651011566 : 10

llvm/test/Analysis/CtxProfAnalysis/inline.ll

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,11 +44,13 @@
4444
; PIPELINE-LABEL: loop:
4545
; PIPELINE: br i1 %cond, label %loop, label %exit, !prof ![[LOOP_BW_ORIG:[0-9]+]]
4646

47-
; PIPELINE: ![[ENTRYPOINT_COUNT]] = !{!"function_entry_count", i64 10}
47+
; *Note* that all values are multiplied by the TotalRootEntryCount, which is 24
48+
;
49+
; PIPELINE: ![[ENTRYPOINT_COUNT]] = !{!"function_entry_count", i64 240}
4850
; These are the weights of the inlined @a, where the counters were 2, 100 (2 for entry, 100 for loop)
49-
; PIPELINE: ![[LOOP_BW_INL]] = !{!"branch_weights", i32 98, i32 2}
51+
; PIPELINE: ![[LOOP_BW_INL]] = !{!"branch_weights", i32 2352, i32 48}
5052
; These are the weights of the un-inlined @a, where the counters were 8, 500 (8 for entry, 500 for loop)
51-
; PIPELINE: ![[LOOP_BW_ORIG]] = !{!"branch_weights", i32 492, i32 8}
53+
; PIPELINE: ![[LOOP_BW_ORIG]] = !{!"branch_weights", i32 11808, i32 192}
5254

5355
;--- 1000.ll
5456
define i32 @entrypoint(i32 %x) !guid !0 {

llvm/test/Analysis/CtxProfAnalysis/load.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -60,10 +60,10 @@ Contexts:
6060
Counters: [ 5 ]
6161

6262
Flat Profile:
63-
12341 : 9
64-
728453322856651412 : 6 7
65-
11872291593386833696 : 1
66-
12074870348631550642 : 5
63+
12341 : 810
64+
728453322856651412 : 24 28
65+
11872291593386833696 : 4
66+
12074870348631550642 : 120
6767
;--- example.ll
6868
declare void @bar()
6969

0 commit comments

Comments
 (0)